Open-Meteo maintains an API that allows non-commercial use of the historical weather data hosted on the website.
This file runs exploratory analysis on some of the historical weather data.
The exploration process uses tidyverse and several generic custom functions:
library(tidyverse) # tidyverse functionality is included throughout
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
source("./Generic_Added_Utility_Functions_202105_v001.R") # Basic functions
A sample of data for 365 days has been downloaded as a CSV. The downloaded data has three separate tables included in a single file, separated by blank rows. The first table is location data, the second table is hourly data, and the third table is daily data. For initial exploration, parameters specific to this file are used:
omFileLoc <- "./RInputFiles/openmeteo_20230612_example.csv"
# Location data
omLocation <- readr::read_csv(omFileLoc, n_max=1, skip=0)
## Rows: 1 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): timezone, timezone_abbreviation
## dbl (4): latitude, longitude, elevation, utc_offset_seconds
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
omLocation
## # A tibble: 1 × 6
## latitude longitude elevation utc_offset_seconds timezone timezone_abb…¹
## <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 41.8 -87.6 179 -18000 America/Chicago CDT
## # … with abbreviated variable name ¹timezone_abbreviation
# Hourly data
# Elements: time, 2m temp (C), 2m dew point (C), 2m relative humidity (%), precip (mm), rain (mm), and snow (cm)
omHourlyRaw <- readr::read_csv(omFileLoc, n_max=8760, skip=3)
## Rows: 8760 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): temperature_2m (°C), relativehumidity_2m (%), dewpoint_2m (°C), pr...
## dttm (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
omHourlyProcess <- omHourlyRaw %>%
purrr::set_names(c("time", "temp2m_C", "relH2m", "dew2m_C", "precip_mm", "rain_mm", "snow_cm")) %>%
mutate(date=date(time))
omHourlyProcess
## # A tibble: 8,760 × 8
## time temp2…¹ relH2m dew2m_C preci…² rain_mm snow_cm date
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <date>
## 1 2022-06-08 00:00:00 13.4 91 11.9 0 0 0 2022-06-08
## 2 2022-06-08 01:00:00 13.4 91 12 0 0 0 2022-06-08
## 3 2022-06-08 02:00:00 13.8 87 11.8 0 0 0 2022-06-08
## 4 2022-06-08 03:00:00 13.8 87 11.7 0 0 0 2022-06-08
## 5 2022-06-08 04:00:00 14 85 11.6 0 0 0 2022-06-08
## 6 2022-06-08 05:00:00 14.4 82 11.3 0 0 0 2022-06-08
## 7 2022-06-08 06:00:00 14.8 79 11.1 0 0 0 2022-06-08
## 8 2022-06-08 07:00:00 15.1 77 11.1 0.1 0.1 0 2022-06-08
## 9 2022-06-08 08:00:00 15.7 75 11.2 0 0 0 2022-06-08
## 10 2022-06-08 09:00:00 16.4 72 11.3 0 0 0 2022-06-08
## # … with 8,750 more rows, and abbreviated variable names ¹temp2m_C, ²precip_mm
summary(omHourlyProcess)
## time temp2m_C relH2m
## Min. :2022-06-08 00:00:00 Min. :-20.60 Min. : 32.00
## 1st Qu.:2022-09-07 05:45:00 1st Qu.: 2.80 1st Qu.: 62.00
## Median :2022-12-07 11:30:00 Median : 10.30 Median : 72.00
## Mean :2022-12-07 11:30:00 Mean : 10.81 Mean : 72.38
## 3rd Qu.:2023-03-08 17:15:00 3rd Qu.: 19.80 3rd Qu.: 83.00
## Max. :2023-06-07 23:00:00 Max. : 31.50 Max. :100.00
## NA's :53 NA's :53
## dew2m_C precip_mm rain_mm snow_cm
## Min. :-24.300 Min. : 0.00000 Min. : 0.00000 Min. :0.00000
## 1st Qu.: -1.400 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.:0.00000
## Median : 5.500 Median : 0.00000 Median : 0.00000 Median :0.00000
## Mean : 5.792 Mean : 0.09986 Mean : 0.09167 Mean :0.00573
## 3rd Qu.: 14.700 3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.:0.00000
## Max. : 24.200 Max. :11.10000 Max. :11.10000 Max. :1.26000
## NA's :53 NA's :53 NA's :53 NA's :53
## date
## Min. :2022-06-08
## 1st Qu.:2022-09-07
## Median :2022-12-07
## Mean :2022-12-07
## 3rd Qu.:2023-03-08
## Max. :2023-06-07
##
# Daily data
# Elements: date, sum of precip (mm), sum of rain (mm), and sum of snow (cm)
omDailyRaw <- readr::read_csv(omFileLoc, n_max=365, skip=8765)
## Rows: 365 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): precipitation_sum (mm), rain_sum (mm), snowfall_sum (cm)
## date (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
omDailyProcess <- omDailyRaw %>%
purrr::set_names(c("date", "precip_mm", "rain_mm", "snow_cm"))
omDailyProcess
## # A tibble: 365 × 4
## date precip_mm rain_mm snow_cm
## <date> <dbl> <dbl> <dbl>
## 1 2022-06-08 16 16 0
## 2 2022-06-09 0 0 0
## 3 2022-06-10 0.6 0.6 0
## 4 2022-06-11 0 0 0
## 5 2022-06-12 1.3 1.3 0
## 6 2022-06-13 2.6 2.6 0
## 7 2022-06-14 0 0 0
## 8 2022-06-15 0 0 0
## 9 2022-06-16 9.5 9.5 0
## 10 2022-06-17 0 0 0
## # … with 355 more rows
summary(omDailyProcess)
## date precip_mm rain_mm snow_cm
## Min. :2022-06-08 Min. : 0.000 Min. : 0.000 Min. :0.0000
## 1st Qu.:2022-09-07 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:0.0000
## Median :2022-12-07 Median : 0.000 Median : 0.000 Median :0.0000
## Mean :2022-12-07 Mean : 2.402 Mean : 2.205 Mean :0.1379
## 3rd Qu.:2023-03-08 3rd Qu.: 1.875 3rd Qu.: 1.300 3rd Qu.:0.0000
## Max. :2023-06-07 Max. :40.000 Max. :40.000 Max. :6.6500
## NA's :3 NA's :3 NA's :3
A function is written to read a portion of a CSV file:
partialCSVRead <- function(loc, firstRow=1L, lastRow=+Inf, col_names=TRUE, ...) {
  # Read a contiguous slice of rows from a CSV file as a tibble.
  # FUNCTION arguments
  #   loc: file location
  #   firstRow: first row that is relevant to the partial file read (whether header line or data line)
  #   lastRow: last row that is relevant to the partial file read (+Inf means read until last line of file)
  #   col_names: the col_names parameter passed to readr::read_csv
  #     TRUE means header=TRUE (get column names from file, read data starting on next line)
  #     FALSE means header=FALSE (auto-generate column names, read data starting on first line)
  #     character vector means use these as column names (read data starting on first line)
  #   ...: additional arguments passed to read_csv
  # Returns the tibble produced by readr::read_csv().
  # Read the file and return
  #   skip: rows to be skipped are all those prior to firstRow
  #   n_max: maximum data rows read are lastRow-firstRow, with one additional
  #     data row when col_names is not TRUE (no header line is consumed then)
  # NOTE: col_names is now forwarded to read_csv; previously it was documented
  # and used for the n_max adjustment but never actually passed through, so
  # col_names=FALSE or a character vector had no effect on parsing.
  readr::read_csv(loc,
                  skip=firstRow-1,
                  n_max=lastRow-firstRow+if(isTRUE(col_names)) 0 else 1,
                  col_names=col_names,
                  ...
  )
}
# Double check that data are the same
partialCSVRead(omFileLoc, firstRow=1L, lastRow=2L) %>% all.equal(omLocation)
## Rows: 1 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): timezone, timezone_abbreviation
## dbl (4): latitude, longitude, elevation, utc_offset_seconds
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] TRUE
partialCSVRead(omFileLoc, firstRow=4L, lastRow=8764L) %>% all.equal(omHourlyRaw)
## Rows: 8760 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): temperature_2m (°C), relativehumidity_2m (%), dewpoint_2m (°C), pr...
## dttm (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] TRUE
partialCSVRead(omFileLoc, firstRow=8766L, lastRow=+Inf) %>% all.equal(omDailyRaw)
## Rows: 365 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): precipitation_sum (mm), rain_sum (mm), snowfall_sum (cm)
## date (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] TRUE
The blank lines are assessed, allowing for all tables to be read at the same time:
# Get the break points for gaps in a vector (e.g., 0, 3, 5:8, 20 has break points 0, 3, 5, 20 and 0, 3, 8, 20)
vecGaps <- function(x, addElements=c(), sortUnique=TRUE) {
  # Find the break points of runs of consecutive integers in a vector.
  # e.g., c(0, 3, 5:8, 20) has run starts 0, 3, 5, 20 and run ends 0, 3, 8, 20.
  # FUNCTION arguments
  #   x: numeric/integer vector of positions
  #   addElements: extra positions prepended to x before processing
  #   sortUnique: if TRUE, sort x and drop duplicates first (set FALSE only
  #     when x is already sorted and unique)
  # Returns a list with:
  #   starts: first element of each run, with a trailing +Inf sentinel so
  #     starts[i+1] can always be referenced for the i-th run
  #   ends: last element of each run
  if(length(addElements)>0) x <- c(addElements, x)
  if(isTRUE(sortUnique)) x <- unique(sort(x))
  # Empty input: no runs, only the +Inf sentinel (matches the lag/lead logic)
  if(length(x)==0) return(list("starts"=+Inf, "ends"=x))
  # Base-R run detection via diff() removes the hidden dplyr::lag()/lead()
  # dependency (bare lag() resolves to stats::lag when dplyr is not attached,
  # which silently misbehaves on plain vectors)
  jumps <- diff(x) > 1
  list("starts"=c(x[c(TRUE, jumps)], +Inf),
       "ends"=x[c(jumps, TRUE)]
  )
}
vecGaps(c(3, 5:8, 20), addElements=0)
## $starts
## [1] 0 3 5 20 Inf
##
## $ends
## [1] 0 3 8 20
# Find the break points in a single file
flatFileGaps <- function(loc) {
  # Locate the blank-line separators in a flat file.
  # Returns the vecGaps() break points for the blank-line row numbers, with
  # row 0 prepended so the first table's start (ends[1]+1 = row 1) is covered.
  # Base nchar() replaces stringr::str_length() (identical on readLines()
  # output) to avoid an unnecessary stringr dependency.
  blankRows <- which(nchar(readLines(loc)) == 0)
  vecGaps(blankRows, addElements=0)
}
flatFileGaps(omFileLoc)
## $starts
## [1] 0 3 8765 Inf
##
## $ends
## [1] 0 3 8765
# Read all relevant data as CSV with header
readMultiCSV <- function(loc, col_names=TRUE, ...) {
  # Read every blank-line-separated table in a flat file as its own CSV.
  # FUNCTION arguments
  #   loc: file location
  #   col_names: passed through to partialCSVRead (and so to read_csv)
  #   ...: additional arguments passed through to partialCSVRead
  # Returns a list of tibbles, one per table, in file order.
  breaks <- flatFileGaps(loc)
  nTables <- length(breaks$ends)
  tables <- vector("list", nTables)
  for(i in seq_len(nTables)) {
    # Each table runs from the row after the previous gap to the row before
    # the next gap (the +Inf sentinel in starts covers the final table)
    tables[[i]] <- partialCSVRead(loc,
                                  firstRow=breaks$ends[i]+1,
                                  lastRow=breaks$starts[i+1]-1,
                                  col_names=col_names,
                                  ...
    )
  }
  tables
}
tstMultiCSV <- readMultiCSV(omFileLoc)
## Rows: 1 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): timezone, timezone_abbreviation
## dbl (4): latitude, longitude, elevation, utc_offset_seconds
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 8760 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): temperature_2m (°C), relativehumidity_2m (%), dewpoint_2m (°C), pr...
## dttm (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 365 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): precipitation_sum (mm), rain_sum (mm), snowfall_sum (cm)
## date (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all.equal(tstMultiCSV[[1]], omLocation)
## [1] TRUE
all.equal(tstMultiCSV[[2]], omHourlyRaw)
## [1] TRUE
all.equal(tstMultiCSV[[3]], omDailyRaw)
## [1] TRUE
Data can also be downloaded through the Open-Meteo API, returning a JSON file. The data download has been completed off-line to minimize repeated hits against the server. The JSON file can then be read:
# Example download sequence
# download.file("https://archive-api.open-meteo.com/v1/archive?latitude=41.85&longitude=-87.65&start_date=2022-06-01&end_date=2023-06-08&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,precipitation,rain,snowfall&daily=precipitation_sum,rain_sum,snowfall_sum&timezone=America%2FChicago", "tempOM")
# Create hourly data tibble
jsonHourly <- jsonlite::read_json("tempOM", simplifyVector = TRUE)[["hourly"]] %>%
tibble::as_tibble() %>%
mutate(tm=lubridate::ymd_hm(time), date=date(tm))
jsonHourly
## # A tibble: 8,952 × 9
## time tempe…¹ relat…² dewpo…³ preci…⁴ rain snowf…⁵ tm
## <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dttm>
## 1 2022-06-01… 21 92 19.6 0.1 0.1 0 2022-06-01 00:00:00
## 2 2022-06-01… 20.6 93 19.5 0.3 0.3 0 2022-06-01 01:00:00
## 3 2022-06-01… 21 93 19.8 0 0 0 2022-06-01 02:00:00
## 4 2022-06-01… 20.8 93 19.7 0 0 0 2022-06-01 03:00:00
## 5 2022-06-01… 20.5 93 19.4 0 0 0 2022-06-01 04:00:00
## 6 2022-06-01… 19.8 95 19 0.7 0.7 0 2022-06-01 05:00:00
## 7 2022-06-01… 19.3 97 18.8 1.6 1.6 0 2022-06-01 06:00:00
## 8 2022-06-01… 19 97 18.4 1 1 0 2022-06-01 07:00:00
## 9 2022-06-01… 18.1 92 16.9 0.1 0.1 0 2022-06-01 08:00:00
## 10 2022-06-01… 16.8 87 14.6 0 0 0 2022-06-01 09:00:00
## # … with 8,942 more rows, 1 more variable: date <date>, and abbreviated
## # variable names ¹temperature_2m, ²relativehumidity_2m, ³dewpoint_2m,
## # ⁴precipitation, ⁵snowfall
# Create daily data tibble
jsonDaily <- jsonlite::read_json("tempOM", simplifyVector = TRUE)[["daily"]] %>%
tibble::as_tibble()
jsonDaily
## # A tibble: 373 × 4
## time precipitation_sum rain_sum snowfall_sum
## <chr> <dbl> <dbl> <dbl>
## 1 2022-06-01 3.8 3.8 0
## 2 2022-06-02 0 0 0
## 3 2022-06-03 0 0 0
## 4 2022-06-04 1.3 1.3 0
## 5 2022-06-05 0.3 0.3 0
## 6 2022-06-06 12.5 12.5 0
## 7 2022-06-07 2 2 0
## 8 2022-06-08 16 16 0
## 9 2022-06-09 0 0 0
## 10 2022-06-10 0.6 0.6 0
## # … with 363 more rows
# Extract other elements
jsonNames <- jsonlite::read_json("tempOM", simplifyVector = TRUE) %>% names
for (jsonName in jsonNames[!(jsonNames %in% c("daily", "hourly", "daily_units", "hourly_units"))]) {
cat("\n", jsonName, ":", jsonlite::read_json("tempOM", simplifyVector = TRUE)[[jsonName]])
}
##
## latitude : 41.8
## longitude : -87.6
## generationtime_ms : 2.892971
## utc_offset_seconds : -18000
## timezone : America/Chicago
## timezone_abbreviation : CDT
## elevation : 179
for (jsonName in jsonNames[jsonNames %in% c("daily_units", "hourly_units")]) {
cat("\n", jsonName, ":\n")
print(jsonlite::read_json("tempOM", simplifyVector = TRUE)[[jsonName]] %>% tibble::as_tibble() %>% t())
}
##
## hourly_units :
## [,1]
## time "iso8601"
## temperature_2m "°C"
## relativehumidity_2m "%"
## dewpoint_2m "°C"
## precipitation "mm"
## rain "mm"
## snowfall "cm"
##
## daily_units :
## [,1]
## time "iso8601"
## precipitation_sum "mm"
## rain_sum "mm"
## snowfall_sum "cm"
Daily data read from JSON and CSV are compared:
# Convert variable names in JSON daily data
jsonDailyProcess <- jsonDaily %>%
colRenamer(vecRename=c("precipitation_sum"="precip_mm",
"rain_sum"="rain_mm",
"snowfall_sum"="snow_cm",
"time"="date"
)
) %>%
mutate(date=as.Date(date))
jsonDailyProcess
## # A tibble: 373 × 4
## date precip_mm rain_mm snow_cm
## <date> <dbl> <dbl> <dbl>
## 1 2022-06-01 3.8 3.8 0
## 2 2022-06-02 0 0 0
## 3 2022-06-03 0 0 0
## 4 2022-06-04 1.3 1.3 0
## 5 2022-06-05 0.3 0.3 0
## 6 2022-06-06 12.5 12.5 0
## 7 2022-06-07 2 2 0
## 8 2022-06-08 16 16 0
## 9 2022-06-09 0 0 0
## 10 2022-06-10 0.6 0.6 0
## # … with 363 more rows
# Check dates included
omDailyProcess %>%
select(date) %>%
mutate(inCSV=1) %>%
full_join(mutate(select(jsonDailyProcess, "date"), inJSON=1), by="date") %>%
filter(!complete.cases(.))
## # A tibble: 8 × 3
## date inCSV inJSON
## <date> <dbl> <dbl>
## 1 2022-06-01 NA 1
## 2 2022-06-02 NA 1
## 3 2022-06-03 NA 1
## 4 2022-06-04 NA 1
## 5 2022-06-05 NA 1
## 6 2022-06-06 NA 1
## 7 2022-06-07 NA 1
## 8 2023-06-08 NA 1
# Check column names
all.equal(names(omDailyProcess), names(jsonDailyProcess))
## [1] TRUE
# Check data elements from 2022-06-08 through 2023-06-04 (last full day of data)
all.equal(omDailyProcess %>% tibble::as_tibble() %>% filter(date>="2022-06-08", date<="2023-06-04"),
jsonDailyProcess %>% filter(date>="2022-06-08", date<="2023-06-04")
)
## [1] TRUE
Hourly data read from JSON and CSV are compared:
# Convert variable names in JSON hourly data
jsonHourlyProcess <- jsonHourly %>%
select(-time) %>%
colRenamer(vecRename=c("temperature_2m"="temp2m_C",
"relativehumidity_2m"="relH2m",
"dewpoint_2m"="dew2m_C",
"precipitation"="precip_mm",
"rain"="rain_mm",
"snowfall"="snow_cm",
"tm"="time"
)
) %>%
select(time, everything())
jsonHourlyProcess
## # A tibble: 8,952 × 8
## time temp2…¹ relH2m dew2m_C preci…² rain_mm snow_cm date
## <dttm> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <date>
## 1 2022-06-01 00:00:00 21 92 19.6 0.1 0.1 0 2022-06-01
## 2 2022-06-01 01:00:00 20.6 93 19.5 0.3 0.3 0 2022-06-01
## 3 2022-06-01 02:00:00 21 93 19.8 0 0 0 2022-06-01
## 4 2022-06-01 03:00:00 20.8 93 19.7 0 0 0 2022-06-01
## 5 2022-06-01 04:00:00 20.5 93 19.4 0 0 0 2022-06-01
## 6 2022-06-01 05:00:00 19.8 95 19 0.7 0.7 0 2022-06-01
## 7 2022-06-01 06:00:00 19.3 97 18.8 1.6 1.6 0 2022-06-01
## 8 2022-06-01 07:00:00 19 97 18.4 1 1 0 2022-06-01
## 9 2022-06-01 08:00:00 18.1 92 16.9 0.1 0.1 0 2022-06-01
## 10 2022-06-01 09:00:00 16.8 87 14.6 0 0 0 2022-06-01
## # … with 8,942 more rows, and abbreviated variable names ¹temp2m_C, ²precip_mm
# Check dates included
omHourlyProcess %>%
count(date, name="nCSV") %>%
full_join(count(jsonHourlyProcess, date, name="nJSON"), by="date") %>%
filter(!complete.cases(.))
## # A tibble: 8 × 3
## date nCSV nJSON
## <date> <int> <int>
## 1 2022-06-01 NA 24
## 2 2022-06-02 NA 24
## 3 2022-06-03 NA 24
## 4 2022-06-04 NA 24
## 5 2022-06-05 NA 24
## 6 2022-06-06 NA 24
## 7 2022-06-07 NA 24
## 8 2023-06-08 NA 24
# Check column names
all.equal(names(omHourlyProcess), names(jsonHourlyProcess))
## [1] TRUE
# Check data elements from 2022-06-08 through 2023-06-04 (last full day of data)
all.equal(omHourlyProcess %>% tibble::as_tibble() %>% filter(date>="2022-06-08", date<="2023-06-04"),
jsonHourlyProcess %>% filter(date>="2022-06-08", date<="2023-06-04")
)
## [1] TRUE
Metrics that can be requested for hourly and daily data include:
hourlyMetrics <- "temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm"
dailyMetrics <- "weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration"
hourlyDescription <- "Air temperature at 2 meters above ground\nRelative humidity at 2 meters above ground\nDew point temperature at 2 meters above ground\nApparent temperature is the perceived feels-like temperature combining wind chill factor, relative humidity and solar radiation\nAtmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology. Surface pressure gets lower with increasing elevation.\nAtmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology. Surface pressure gets lower with increasing elevation.\nTotal precipitation (rain, showers, snow) sum of the preceding hour. Data is stored with a 0.1 mm precision. If precipitation data is summed up to monthly sums, there might be small inconsistencies with the total precipitation amount.\nOnly liquid precipitation of the preceding hour including local showers and rain from large scale systems.\nSnowfall amount of the preceding hour in centimeters. For the water equivalent in millimeter, divide by 7. E.g. 7 cm snow = 10 mm precipitation water equivalent\nTotal cloud cover as an area fraction\nLow level clouds and fog up to 2 km altitude\nMid level clouds from 2 to 6 km altitude\nHigh level clouds from 6 km altitude\nShortwave solar radiation as average of the preceding hour. This is equal to the total global horizontal irradiation\nDirect solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)\nDirect solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)\nDiffuse solar radiation as average of the preceding hour\nWind speed at 10 or 100 meters above ground. Wind speed on 10 meters is the standard level.\nWind speed at 10 or 100 meters above ground. 
Wind speed on 10 meters is the standard level.\nWind direction at 10 or 100 meters above ground\nWind direction at 10 or 100 meters above ground\nGusts at 10 meters above ground of the indicated hour. Wind gusts in CERRA are defined as the maximum wind gusts of the preceding hour. Please consult the ECMWF IFS documentation for more information on how wind gusts are parameterized in weather models.\nET0 Reference Evapotranspiration of a well watered grass field. Based on FAO-56 Penman-Monteith equations ET0 is calculated from temperature, wind speed, humidity and solar radiation. Unlimited soil water is assumed. ET0 is commonly used to estimate the required irrigation for plants.\nWeather condition as a numeric code. Follow WMO weather interpretation codes. See table below for details. Weather code is calculated from cloud cover analysis, precipitation and snowfall. As barely no information about atmospheric stability is available, estimation about thunderstorms is not possible.\nVapor Pressure Deificit (VPD) in kilopascal (kPa). For high VPD (>1.6), water transpiration of plants increases. For low VPD (<0.4), transpiration decreases\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths."
dailyDescription <- "The most severe weather condition on a given day\nMaximum and minimum daily air temperature at 2 meters above ground\nMaximum and minimum daily air temperature at 2 meters above ground\nMaximum and minimum daily apparent temperature\nMaximum and minimum daily apparent temperature\nSum of daily precipitation (including rain, showers and snowfall)\nSum of daily rain\nSum of daily snowfall\nThe number of hours with rain\nSun rise and set times\nSun rise and set times\nMaximum wind speed and gusts on a day\nMaximum wind speed and gusts on a day\nDominant wind direction\nThe sum of solar radiaion on a given day in Megajoules\nDaily sum of ET0 Reference Evapotranspiration of a well watered grass field"
# Create tibble for hourly metrics
tblMetricsHourly <- tibble::tibble(metric=hourlyMetrics %>% str_split_1(","),
description=hourlyDescription %>% str_split_1("\n")
)
tblMetricsHourly %>%
print(n=50)
## # A tibble: 33 × 2
## metric description
## <chr> <chr>
## 1 temperature_2m Air temperature at 2 meters above ground
## 2 relativehumidity_2m Relative humidity at 2 meters above ground
## 3 dewpoint_2m Dew point temperature at 2 meters above ground
## 4 apparent_temperature Apparent temperature is the perceived feels-li…
## 5 pressure_msl Atmospheric air pressure reduced to mean sea l…
## 6 surface_pressure Atmospheric air pressure reduced to mean sea l…
## 7 precipitation Total precipitation (rain, showers, snow) sum …
## 8 rain Only liquid precipitation of the preceding hou…
## 9 snowfall Snowfall amount of the preceding hour in centi…
## 10 cloudcover Total cloud cover as an area fraction
## 11 cloudcover_low Low level clouds and fog up to 2 km altitude
## 12 cloudcover_mid Mid level clouds from 2 to 6 km altitude
## 13 cloudcover_high High level clouds from 6 km altitude
## 14 shortwave_radiation Shortwave solar radiation as average of the pr…
## 15 direct_radiation Direct solar radiation as average of the prece…
## 16 direct_normal_irradiance Direct solar radiation as average of the prece…
## 17 diffuse_radiation Diffuse solar radiation as average of the prec…
## 18 windspeed_10m Wind speed at 10 or 100 meters above ground. W…
## 19 windspeed_100m Wind speed at 10 or 100 meters above ground. W…
## 20 winddirection_10m Wind direction at 10 or 100 meters above ground
## 21 winddirection_100m Wind direction at 10 or 100 meters above ground
## 22 windgusts_10m Gusts at 10 meters above ground of the indicat…
## 23 et0_fao_evapotranspiration ET0 Reference Evapotranspiration of a well wat…
## 24 weathercode Weather condition as a numeric code. Follow WM…
## 25 vapor_pressure_deficit Vapor Pressure Deificit (VPD) in kilopascal (k…
## 26 soil_temperature_0_to_7cm Average temperature of different soil levels b…
## 27 soil_temperature_7_to_28cm Average temperature of different soil levels b…
## 28 soil_temperature_28_to_100cm Average temperature of different soil levels b…
## 29 soil_temperature_100_to_255cm Average temperature of different soil levels b…
## 30 soil_moisture_0_to_7cm Average soil water content as volumetric mixin…
## 31 soil_moisture_7_to_28cm Average soil water content as volumetric mixin…
## 32 soil_moisture_28_to_100cm Average soil water content as volumetric mixin…
## 33 soil_moisture_100_to_255cm Average soil water content as volumetric mixin…
# Create tibble for daily metrics
tblMetricsDaily <- tibble::tibble(metric=dailyMetrics %>% str_split_1(","),
description=dailyDescription %>% str_split_1("\n")
)
tblMetricsDaily
## # A tibble: 16 × 2
## metric description
## <chr> <chr>
## 1 weathercode The most severe weather condition on a given day
## 2 temperature_2m_max Maximum and minimum daily air temperature at 2 me…
## 3 temperature_2m_min Maximum and minimum daily air temperature at 2 me…
## 4 apparent_temperature_max Maximum and minimum daily apparent temperature
## 5 apparent_temperature_min Maximum and minimum daily apparent temperature
## 6 precipitation_sum Sum of daily precipitation (including rain, showe…
## 7 rain_sum Sum of daily rain
## 8 snowfall_sum Sum of daily snowfall
## 9 precipitation_hours The number of hours with rain
## 10 sunrise Sun rise and set times
## 11 sunset Sun rise and set times
## 12 windspeed_10m_max Maximum wind speed and gusts on a day
## 13 windgusts_10m_max Maximum wind speed and gusts on a day
## 14 winddirection_10m_dominant Dominant wind direction
## 15 shortwave_radiation_sum The sum of solar radiaion on a given day in Megaj…
## 16 et0_fao_evapotranspiration Daily sum of ET0 Reference Evapotranspiration of …
Data can then be assembled into a string that is compatible with the Open-Meteo API format:
openMeteoURLCreate <- function(mainURL="https://archive-api.open-meteo.com/v1/archive",
                               lat=45,
                               lon=-90,
                               startDate=paste(year(Sys.Date())-1, "01", "01", sep="-"),
                               endDate=paste(year(Sys.Date())-1, "12", "31", sep="-"),
                               hourlyMetrics=NULL,
                               dailyMetrics=NULL,
                               tz="GMT",
                               ...
                               ) {
  # Build a query URL for the Open-Meteo historical weather archive API.
  # FUNCTION arguments
  #   mainURL: API endpoint
  #   lat, lon: location in decimal degrees
  #   startDate, endDate: "YYYY-MM-DD" strings; defaults cover all of last year
  #   hourlyMetrics, dailyMetrics: comma-separated metric strings, or NULL to
  #     omit the corresponding query parameter entirely
  #   tz: Olson timezone name (e.g. "America/Chicago")
  #   ...: additional strings appended verbatim to the end of the URL
  # Returns the assembled URL as a length-1 character string.
  # Create formatted string
  fString <- paste0(mainURL,
                    "?latitude=", lat,
                    "&longitude=", lon,
                    "&start_date=", startDate,
                    "&end_date=", endDate
  )
  if(!is.null(hourlyMetrics)) fString <- paste0(fString, "&hourly=", hourlyMetrics)
  if(!is.null(dailyMetrics)) fString <- paste0(fString, "&daily=", dailyMetrics)
  # Percent-encode ALL slashes in the timezone: str_replace() only replaced
  # the first one, which broke three-part zones such as
  # "America/Argentina/Buenos_Aires"; gsub(fixed=TRUE) is also base-only
  paste0(fString, "&timezone=", gsub("/", "%2F", tz, fixed=TRUE), ...)
}
# Blank example
openMeteoURLCreate()
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=45&longitude=-90&start_date=2022-01-01&end_date=2022-12-31&timezone=GMT"
# Matching previous CSV data pull
openMeteoURLCreate(lat=41.85,
lon=-87.65,
startDate="2022-06-01",
endDate="2023-06-08",
hourlyMetrics="temperature_2m,relativehumidity_2m,dewpoint_2m,precipitation,rain,snowfall",
dailyMetrics="precipitation_sum,rain_sum,snowfall_sum",
tz="America/Chicago"
)
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=41.85&longitude=-87.65&start_date=2022-06-01&end_date=2023-06-08&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,precipitation,rain,snowfall&daily=precipitation_sum,rain_sum,snowfall_sum&timezone=America%2FChicago"
A helper function is created to convert cities to lat/lon and to allow for selection of hourly and daily metrics by index number:
helperOpenMeteoURL <- function(cityName=NULL,
                               lat=NULL,
                               lon=NULL,
                               hourlyMetrics=NULL,
                               hourlyIndices=NULL,
                               hourlyDesc=tblMetricsHourly,
                               dailyMetrics=NULL,
                               dailyIndices=NULL,
                               dailyDesc=tblMetricsDaily,
                               startDate=NULL,
                               endDate=NULL,
                               tz=NULL,
                               ...
                               ) {
  # Wrapper around openMeteoURLCreate() that resolves a US city name to
  # lat/lon (via maps::us.cities) and allows metric selection by row index
  # into the metric-description tibbles.
  # FUNCTION arguments
  #   cityName: city name matching exactly one row of maps::us.cities$name;
  #     used only when lat or lon is NULL
  #   lat, lon: location in decimal degrees (override cityName when given)
  #   hourlyMetrics/dailyMetrics: comma-separated metric strings; when NULL,
  #     built from hourlyIndices/dailyIndices against hourlyDesc/dailyDesc
  #   hourlyIndices/dailyIndices: integer row indices into the metric tables
  #   hourlyDesc/dailyDesc: metric-description tibbles with a `metric` column
  #   startDate, endDate, tz: passed through; NULL means use the defaults
  #     declared by openMeteoURLCreate()
  #   ...: additional arguments passed to openMeteoURLCreate()
  # Returns the assembled URL string.
  # Convert city to lat/lon if lat/lon are NULL
  # (scalar || / && rather than vectorized | / &: these are length-1 checks,
  # and && / || error on longer inputs as of R 4.3)
  if(is.null(lat) || is.null(lon)) {
    if(is.null(cityName)) stop("\nMust provide lat/lon or city name available in maps::us.cities\n")
    cityData <- maps::us.cities %>% tibble::as_tibble() %>% filter(name==cityName)
    if(nrow(cityData)!=1) stop("\nMust provide city name that maps uniquely to maps::us.cities$name\n")
    lat <- cityData$lat[1]
    lon <- cityData$long[1]
  }
  # Get hourly metrics by index if relevant
  if(is.null(hourlyMetrics) && !is.null(hourlyIndices)) {
    hourlyMetrics <- hourlyDesc %>% slice(hourlyIndices) %>% pull(metric)
    hourlyMetrics <- paste0(hourlyMetrics, collapse=",")
    cat("\nHourly metrics created from indices:", hourlyMetrics, "\n\n")
  }
  # Get daily metrics by index if relevant
  if(is.null(dailyMetrics) && !is.null(dailyIndices)) {
    dailyMetrics <- dailyDesc %>% slice(dailyIndices) %>% pull(metric)
    dailyMetrics <- paste0(dailyMetrics, collapse=",")
    cat("\nDaily metrics created from indices:", dailyMetrics, "\n\n")
  }
  # Use default values from openMeteoURLCreate() for startDate, endDate, and
  # tz if passed as NULL (evaluate the default expressions from its formals)
  if(is.null(startDate)) startDate <- eval(formals(openMeteoURLCreate)$startDate)
  if(is.null(endDate)) endDate <- eval(formals(openMeteoURLCreate)$endDate)
  if(is.null(tz)) tz <- eval(formals(openMeteoURLCreate)$tz)
  # Create and return URL
  openMeteoURLCreate(lat=lat,
                     lon=lon,
                     startDate=startDate,
                     endDate=endDate,
                     hourlyMetrics=hourlyMetrics,
                     dailyMetrics=dailyMetrics,
                     tz=tz,
                     ...
  )
}
The URL is tested for file download, cached to avoid multiple hits to the server:
testURL <- helperOpenMeteoURL(cityName="Chicago IL",
hourlyIndices=c(1:3, 7:9),
dailyIndices=6:8,
startDate="2022-06-01",
endDate="2023-06-08",
tz="America/Chicago"
)
##
## Hourly metrics created from indices: temperature_2m,relativehumidity_2m,dewpoint_2m,precipitation,rain,snowfall
##
##
## Daily metrics created from indices: precipitation_sum,rain_sum,snowfall_sum
testURL
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=41.84&longitude=-87.68&start_date=2022-06-01&end_date=2023-06-08&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,precipitation,rain,snowfall&daily=precipitation_sum,rain_sum,snowfall_sum&timezone=America%2FChicago"
# Download file
if(!file.exists("notuse_testOM.json")) {
fileDownload(fileName="notuse_testOM.json", url=testURL)
} else {
cat("\nFile notuse_testOM.json already exists, skipping download\n")
}
## size isdir mode mtime ctime
## notuse_testOM.json 426872 FALSE 666 2023-06-19 08:56:49 2023-06-19 08:56:45
## atime exe
## notuse_testOM.json 2023-06-19 08:56:49 no
Code is created to read the JSON return object:
readOpenMeteoJSON <- function(js) {
# Parse an Open-Meteo archive-API JSON download into tidy tibbles.
# FUNCTION arguments:
# js: path to the JSON file downloaded from Open-Meteo (passed to
#   jsonlite::read_json, so anything that function accepts also works)
# Returns a list with elements:
#   tblDaily, tblHourly: tibbles of the daily/hourly sections (NULL if absent)
#   tblUnits: long tibble of metric units joined to the metric descriptions,
#     hourly rows first when both sections exist (NULL if neither exists)
#   tblDescription: one-row tibble of the remaining scalar metadata
# NOTE(review): relies on globals tblMetricsHourly and tblMetricsDaily for
# the description join; both must exist in the calling environment.
# Get the object and names
jsObj <- jsonlite::read_json(js, simplifyVector = TRUE)
nms <- jsObj %>% names()
cat("\nObjects in JSON include:", paste(nms, collapse=", "), "\n\n")
# Set default objects as NULL so absent sections fall through cleanly below
tblDaily <- NULL
tblHourly <- NULL
tblUnitsDaily <- NULL
tblUnitsHourly <- NULL
# Get daily and hourly as tibble if relevant
if("daily" %in% nms) tblDaily <- jsObj$daily %>% tibble::as_tibble()
if("hourly" %in% nms) tblHourly <- jsObj$hourly %>% tibble::as_tibble()
# Helper function for unit conversions: pivot the one-row units object to
# long (name/value) form, attach the human-readable description from the
# mapper table, and tag each row with the section it came from
helperMetricUnit <- function(x, mapper, desc) {
x %>%
tibble::as_tibble() %>%
pivot_longer(cols=everything()) %>%
left_join(mapper, by=c("name"="metric")) %>%
# Replace the degree sign (U+00B0) with ASCII "deg " for portable printing
mutate(value=stringr::str_replace(value, "\u00b0", "deg ")) %>%
mutate(metricType=desc) %>%
select(metricType, everything())
}
# Get the unit descriptions
if("daily_units" %in% nms)
tblUnitsDaily <- helperMetricUnit(jsObj$daily_units, tblMetricsDaily, desc="daily_units")
if("hourly_units" %in% nms)
tblUnitsHourly <- helperMetricUnit(jsObj$hourly_units, tblMetricsHourly, desc="hourly_units")
# Combine whichever unit tables exist; hourly rows precede daily rows when
# both are present, and tblUnits stays NULL when neither section exists
if(is.null(tblUnitsDaily) & !is.null(tblUnitsHourly)) tblUnits <- tblUnitsHourly
else if(!is.null(tblUnitsDaily) & is.null(tblUnitsHourly)) tblUnits <- tblUnitsDaily
else if(!is.null(tblUnitsDaily) & !is.null(tblUnitsHourly))
tblUnits <- bind_rows(tblUnitsHourly, tblUnitsDaily)
else tblUnits <- NULL
# Put everything else together: remaining scalar elements (lat/lon, timezone,
# elevation, etc.) become the columns of a one-row tibble
tblDescription <- jsObj[setdiff(nms, c("hourly", "hourly_units", "daily", "daily_units"))] %>%
tibble::as_tibble()
# Return the list objects
list(tblDaily=tblDaily, tblHourly=tblHourly, tblUnits=tblUnits, tblDescription=tblDescription)
}
prettyOpenMeteoMeta <- function(df, extr="tblDescription") {
  # Pretty-print one-row metadata as "name: value" lines.
  #
  # FUNCTION arguments:
  # df: a one-row data frame / tibble of metadata, or a list (as returned by
  #     readOpenMeteoJSON()) from which element `extr` is extracted
  # extr: name of the list element holding the metadata table
  #
  # inherits() is the idiomatic class test (vs "list" %in% class(df))
  if(inherits(df, "list")) df <- df[[extr]]
  for(nm in names(df)) {
    # [[ ]] extraction by string avoids dplyr::pull() tidy-eval ambiguity
    # (pull(name) would prefer a column literally named "name" if present)
    cat("\n", nm, ": ", df[[nm]], sep="")
  }
  cat("\n\n")
}
tmpOM <- readOpenMeteoJSON("notuse_testOM.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly, daily_units, daily
tmpOM
## $tblDaily
## # A tibble: 373 × 4
## time precipitation_sum rain_sum snowfall_sum
## <chr> <dbl> <dbl> <dbl>
## 1 2022-06-01 3.8 3.8 0
## 2 2022-06-02 0 0 0
## 3 2022-06-03 0 0 0
## 4 2022-06-04 1.3 1.3 0
## 5 2022-06-05 0.3 0.3 0
## 6 2022-06-06 12.5 12.5 0
## 7 2022-06-07 2 2 0
## 8 2022-06-08 16 16 0
## 9 2022-06-09 0 0 0
## 10 2022-06-10 0.6 0.6 0
## # … with 363 more rows
##
## $tblHourly
## # A tibble: 8,952 × 7
## time temperature_2m relativehumid…¹ dewpo…² preci…³ rain snowf…⁴
## <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2022-06-01T00:00 21 92 19.7 0.1 0.1 0
## 2 2022-06-01T01:00 20.6 94 19.6 0.3 0.3 0
## 3 2022-06-01T02:00 21.1 93 19.9 0 0 0
## 4 2022-06-01T03:00 20.8 93 19.7 0 0 0
## 5 2022-06-01T04:00 20.5 93 19.3 0 0 0
## 6 2022-06-01T05:00 19.7 95 19 0.7 0.7 0
## 7 2022-06-01T06:00 19.4 96 18.8 1.6 1.6 0
## 8 2022-06-01T07:00 19.2 96 18.5 1 1 0
## 9 2022-06-01T08:00 18.6 90 17 0.1 0.1 0
## 10 2022-06-01T09:00 17.7 84 14.9 0 0 0
## # … with 8,942 more rows, and abbreviated variable names ¹relativehumidity_2m,
## # ²dewpoint_2m, ³precipitation, ⁴snowfall
##
## $tblUnits
## # A tibble: 11 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units time iso8601 <NA>
## 2 hourly_units temperature_2m deg C Air temperature at 2 meters above g…
## 3 hourly_units relativehumidity_2m % Relative humidity at 2 meters above…
## 4 hourly_units dewpoint_2m deg C Dew point temperature at 2 meters a…
## 5 hourly_units precipitation mm Total precipitation (rain, showers,…
## 6 hourly_units rain mm Only liquid precipitation of the pr…
## 7 hourly_units snowfall cm Snowfall amount of the preceding ho…
## 8 daily_units time iso8601 <NA>
## 9 daily_units precipitation_sum mm Sum of daily precipitation (includi…
## 10 daily_units rain_sum mm Sum of daily rain
## 11 daily_units snowfall_sum cm Sum of daily snowfall
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seco…¹ timez…² timez…³ eleva…⁴
## <dbl> <dbl> <dbl> <int> <chr> <chr> <dbl>
## 1 41.8 -87.7 2.66 -18000 Americ… CDT 180
## # … with abbreviated variable names ¹utc_offset_seconds, ²timezone,
## # ³timezone_abbreviation, ⁴elevation
prettyOpenMeteoMeta(tmpOM)
##
## latitude: 41.8
## longitude: -87.7
## generationtime_ms: 2.658963
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
Conversion functions are written for hourly and daily data:
omProcessDaily <- function(tbl, extr="tblDaily") {
  # Convert the raw daily table: parse `time` (yyyy-mm-dd) into a Date
  # column `date`, placed first.
  #
  # FUNCTION arguments:
  # tbl: daily tibble, or a list (as returned by readOpenMeteoJSON()) from
  #      which element `extr` is extracted
  # extr: name of the list element holding the daily table
  #
  # inherits() is the idiomatic class test (vs "list" %in% class(tbl))
  if(inherits(tbl, "list")) tbl <- tbl[[extr]]
  tbl %>% mutate(date=lubridate::ymd(time)) %>% select(date, everything())
}
omProcessHourly <- function(tbl, extr="tblHourly") {
  # Convert the raw hourly table: parse `time` (yyyy-mm-ddThh:mm) into a
  # datetime and derive `date` and `hour`; the original string is kept as
  # `origTime`.
  #
  # FUNCTION arguments:
  # tbl: hourly tibble, or a list (as returned by readOpenMeteoJSON()) from
  #      which element `extr` is extracted
  # extr: name of the list element holding the hourly table
  #
  # inherits() is the idiomatic class test (vs "list" %in% class(tbl))
  if(inherits(tbl, "list")) tbl <- tbl[[extr]]
  tbl %>%
    mutate(origTime=time,
           time=lubridate::ymd_hm(time),
           date=lubridate::date(time),
           hour=lubridate::hour(time)
    ) %>%
    select(time, date, hour, everything())
}
omProcessDaily(tmpOM)
## # A tibble: 373 × 5
## date time precipitation_sum rain_sum snowfall_sum
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2022-06-01 2022-06-01 3.8 3.8 0
## 2 2022-06-02 2022-06-02 0 0 0
## 3 2022-06-03 2022-06-03 0 0 0
## 4 2022-06-04 2022-06-04 1.3 1.3 0
## 5 2022-06-05 2022-06-05 0.3 0.3 0
## 6 2022-06-06 2022-06-06 12.5 12.5 0
## 7 2022-06-07 2022-06-07 2 2 0
## 8 2022-06-08 2022-06-08 16 16 0
## 9 2022-06-09 2022-06-09 0 0 0
## 10 2022-06-10 2022-06-10 0.6 0.6 0
## # … with 363 more rows
omProcessHourly(tmpOM)
## # A tibble: 8,952 × 10
## time date hour temperat…¹ relat…² dewpo…³ preci…⁴ rain
## <dttm> <date> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2022-06-01 00:00:00 2022-06-01 0 21 92 19.7 0.1 0.1
## 2 2022-06-01 01:00:00 2022-06-01 1 20.6 94 19.6 0.3 0.3
## 3 2022-06-01 02:00:00 2022-06-01 2 21.1 93 19.9 0 0
## 4 2022-06-01 03:00:00 2022-06-01 3 20.8 93 19.7 0 0
## 5 2022-06-01 04:00:00 2022-06-01 4 20.5 93 19.3 0 0
## 6 2022-06-01 05:00:00 2022-06-01 5 19.7 95 19 0.7 0.7
## 7 2022-06-01 06:00:00 2022-06-01 6 19.4 96 18.8 1.6 1.6
## 8 2022-06-01 07:00:00 2022-06-01 7 19.2 96 18.5 1 1
## 9 2022-06-01 08:00:00 2022-06-01 8 18.6 90 17 0.1 0.1
## 10 2022-06-01 09:00:00 2022-06-01 9 17.7 84 14.9 0 0
## # … with 8,942 more rows, 2 more variables: snowfall <dbl>, origTime <chr>, and
## # abbreviated variable names ¹temperature_2m, ²relativehumidity_2m,
## # ³dewpoint_2m, ⁴precipitation
Function readOpenMeteoJSON() is updated to automatically incorporate date conversions:
readOpenMeteoJSON <- function(js, mapDaily=tblMetricsDaily, mapHourly=tblMetricsHourly) {
  # Read an Open-Meteo archive JSON file and split it into tidy tables,
  # automatically applying the date/time conversions to the daily and
  # hourly series.
  #
  # FUNCTION arguments:
  # js: path to the JSON file returned by download from Open-Meteo
  # mapDaily: mapping file for daily metrics
  # mapHourly: mapping file for hourly metrics
  #
  # Returns a list with:
  # tblDaily / tblHourly: processed tibbles (NULL if absent from the JSON)
  # tblUnits: long tibble of units joined to metric descriptions (NULL if absent)
  # tblDescription: one-row tibble of the remaining metadata fields
  # Get the object and names
  jsObj <- jsonlite::read_json(js, simplifyVector = TRUE)
  nms <- names(jsObj)
  cat("\nObjects in JSON include:", paste(nms, collapse=", "), "\n\n")
  # Set default objects as NULL
  tblDaily <- NULL
  tblHourly <- NULL
  tblUnitsDaily <- NULL
  tblUnitsHourly <- NULL
  # Get daily and hourly as tibbles (with date conversions) if relevant
  if("daily" %in% nms) tblDaily <- jsObj$daily %>% tibble::as_tibble() %>% omProcessDaily()
  if("hourly" %in% nms) tblHourly <- jsObj$hourly %>% tibble::as_tibble() %>% omProcessHourly()
  # Helper for unit conversions. `desc` is now passed explicitly rather than
  # inferred via match.call()/deparse(), which was fragile: it only worked
  # when the argument was written exactly as jsObj$<name>.
  helperMetricUnit <- function(x, mapper, desc) {
    x %>%
      tibble::as_tibble() %>%
      pivot_longer(cols=everything()) %>%
      left_join(mapper, by=c("name"="metric")) %>%
      # Replace the degree symbol so values print cleanly in plain text
      mutate(value=stringr::str_replace(value, "\u00b0", "deg ")) %>%
      mutate(metricType=desc) %>%
      select(metricType, everything())
  }
  # Get the unit descriptions
  if("daily_units" %in% nms)
    tblUnitsDaily <- helperMetricUnit(jsObj$daily_units, mapDaily, desc="daily_units")
  if("hourly_units" %in% nms)
    tblUnitsHourly <- helperMetricUnit(jsObj$hourly_units, mapHourly, desc="hourly_units")
  # bind_rows() silently drops NULL inputs, so a single call covers the
  # hourly-only, daily-only, and both cases; keep NULL when neither exists.
  # (&& replaces the elementwise & previously used in these scalar tests.)
  if(is.null(tblUnitsDaily) && is.null(tblUnitsHourly)) {
    tblUnits <- NULL
  } else {
    tblUnits <- bind_rows(tblUnitsHourly, tblUnitsDaily)
  }
  # Put everything else together as a one-row metadata tibble
  tblDescription <- jsObj[setdiff(nms, c("hourly", "hourly_units", "daily", "daily_units"))] %>%
    tibble::as_tibble()
  # Return the list objects
  list(tblDaily=tblDaily, tblHourly=tblHourly, tblUnits=tblUnits, tblDescription=tblDescription)
}
tmpOM2 <- readOpenMeteoJSON("notuse_testOM.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly, daily_units, daily
tmpOM2
## $tblDaily
## # A tibble: 373 × 5
## date time precipitation_sum rain_sum snowfall_sum
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2022-06-01 2022-06-01 3.8 3.8 0
## 2 2022-06-02 2022-06-02 0 0 0
## 3 2022-06-03 2022-06-03 0 0 0
## 4 2022-06-04 2022-06-04 1.3 1.3 0
## 5 2022-06-05 2022-06-05 0.3 0.3 0
## 6 2022-06-06 2022-06-06 12.5 12.5 0
## 7 2022-06-07 2022-06-07 2 2 0
## 8 2022-06-08 2022-06-08 16 16 0
## 9 2022-06-09 2022-06-09 0 0 0
## 10 2022-06-10 2022-06-10 0.6 0.6 0
## # … with 363 more rows
##
## $tblHourly
## # A tibble: 8,952 × 10
## time date hour temperat…¹ relat…² dewpo…³ preci…⁴ rain
## <dttm> <date> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2022-06-01 00:00:00 2022-06-01 0 21 92 19.7 0.1 0.1
## 2 2022-06-01 01:00:00 2022-06-01 1 20.6 94 19.6 0.3 0.3
## 3 2022-06-01 02:00:00 2022-06-01 2 21.1 93 19.9 0 0
## 4 2022-06-01 03:00:00 2022-06-01 3 20.8 93 19.7 0 0
## 5 2022-06-01 04:00:00 2022-06-01 4 20.5 93 19.3 0 0
## 6 2022-06-01 05:00:00 2022-06-01 5 19.7 95 19 0.7 0.7
## 7 2022-06-01 06:00:00 2022-06-01 6 19.4 96 18.8 1.6 1.6
## 8 2022-06-01 07:00:00 2022-06-01 7 19.2 96 18.5 1 1
## 9 2022-06-01 08:00:00 2022-06-01 8 18.6 90 17 0.1 0.1
## 10 2022-06-01 09:00:00 2022-06-01 9 17.7 84 14.9 0 0
## # … with 8,942 more rows, 2 more variables: snowfall <dbl>, origTime <chr>, and
## # abbreviated variable names ¹temperature_2m, ²relativehumidity_2m,
## # ³dewpoint_2m, ⁴precipitation
##
## $tblUnits
## # A tibble: 11 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units time iso8601 <NA>
## 2 hourly_units temperature_2m deg C Air temperature at 2 meters above g…
## 3 hourly_units relativehumidity_2m % Relative humidity at 2 meters above…
## 4 hourly_units dewpoint_2m deg C Dew point temperature at 2 meters a…
## 5 hourly_units precipitation mm Total precipitation (rain, showers,…
## 6 hourly_units rain mm Only liquid precipitation of the pr…
## 7 hourly_units snowfall cm Snowfall amount of the preceding ho…
## 8 daily_units time iso8601 <NA>
## 9 daily_units precipitation_sum mm Sum of daily precipitation (includi…
## 10 daily_units rain_sum mm Sum of daily rain
## 11 daily_units snowfall_sum cm Sum of daily snowfall
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seco…¹ timez…² timez…³ eleva…⁴
## <dbl> <dbl> <dbl> <int> <chr> <chr> <dbl>
## 1 41.8 -87.7 2.66 -18000 Americ… CDT 180
## # … with abbreviated variable names ¹utc_offset_seconds, ²timezone,
## # ³timezone_abbreviation, ⁴elevation
prettyOpenMeteoMeta(tmpOM2)
##
## latitude: 41.8
## longitude: -87.7
## generationtime_ms: 2.658963
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
# Confirm the updated reader matches the original: units and description are
# identical as-is; daily/hourly match once the v1 tables get the same date
# conversions applied
identical(tmpOM$tblUnits, tmpOM2$tblUnits)
## [1] TRUE
identical(tmpOM$tblDescription, tmpOM2$tblDescription)
## [1] TRUE
identical(tmpOM$tblDaily %>% omProcessDaily(), tmpOM2$tblDaily)
## [1] TRUE
identical(tmpOM$tblHourly %>% omProcessHourly(), tmpOM2$tblHourly)
## [1] TRUE
The daily-data URL is tested by downloading the file; the result is cached to avoid multiple hits to the server:
# Build a daily-only URL requesting every metric in tblMetricsDaily across
# the full 2010-2023 span
testURLDaily <- helperOpenMeteoURL(cityName="Chicago IL",
dailyIndices=1:nrow(tblMetricsDaily),
startDate="2010-01-01",
endDate="2023-06-15",
tz="America/Chicago"
)
##
## Daily metrics created from indices: weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
testURLDaily
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=41.84&longitude=-87.68&start_date=2010-01-01&end_date=2023-06-15&daily=weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration&timezone=America%2FChicago"
# Cache the API response locally: only hit the server when the file is absent
if (file.exists("notuse_testOM_daily.json")) {
  cat("\nFile notuse_testOM_daily.json already exists, skipping download\n")
} else {
  fileDownload(fileName="notuse_testOM_daily.json", url=testURLDaily)
}
## size isdir mode mtime
## notuse_testOM_daily.json 573218 FALSE 666 2023-06-23 07:53:04
## ctime atime exe
## notuse_testOM_daily.json 2023-06-23 07:52:59 2023-06-23 07:53:04 no
Data are read and stored as a list:
tmpOMDaily <- readOpenMeteoJSON("notuse_testOM_daily.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily
tmpOMDaily
## $tblDaily
## # A tibble: 4,914 × 18
## date time weath…¹ tempe…² tempe…³ appar…⁴ appar…⁵ preci…⁶ rain_…⁷
## <date> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010-01-01 2010-01-01 3 -8.6 -13.4 -14.9 -19.6 0 0
## 2 2010-01-02 2010-01-02 2 -10.4 -15.1 -17.5 -21.7 0 0
## 3 2010-01-03 2010-01-03 3 -7.9 -13.8 -13.6 -20.1 0 0
## 4 2010-01-04 2010-01-04 3 -6.9 -12.3 -12.8 -18.9 0 0
## 5 2010-01-05 2010-01-05 3 -4.8 -9.8 -10.1 -15.7 0 0
## 6 2010-01-06 2010-01-06 71 -4.9 -9 -9.2 -14.4 0 0
## 7 2010-01-07 2010-01-07 73 -5.2 -8.5 -9.3 -13 7.5 0
## 8 2010-01-08 2010-01-08 73 -3 -9.4 -9.2 -15.3 2.3 0
## 9 2010-01-09 2010-01-09 3 -5.8 -12.3 -10.8 -18.2 0 0
## 10 2010-01-10 2010-01-10 3 -8.8 -19.4 -16.2 -25.6 0 0
## # … with 4,904 more rows, 9 more variables: snowfall_sum <dbl>,
## # precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## # windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## # winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## # et0_fao_evapotranspiration <dbl>, and abbreviated variable names
## # ¹weathercode, ²temperature_2m_max, ³temperature_2m_min,
## # ⁴apparent_temperature_max, ⁵apparent_temperature_min, ⁶precipitation_sum, …
##
## $tblHourly
## NULL
##
## $tblUnits
## # A tibble: 17 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 daily_units time "iso8601" <NA>
## 2 daily_units weathercode "wmo code" The most severe weather co…
## 3 daily_units temperature_2m_max "deg C" Maximum and minimum daily …
## 4 daily_units temperature_2m_min "deg C" Maximum and minimum daily …
## 5 daily_units apparent_temperature_max "deg C" Maximum and minimum daily …
## 6 daily_units apparent_temperature_min "deg C" Maximum and minimum daily …
## 7 daily_units precipitation_sum "mm" Sum of daily precipitation…
## 8 daily_units rain_sum "mm" Sum of daily rain
## 9 daily_units snowfall_sum "cm" Sum of daily snowfall
## 10 daily_units precipitation_hours "h" The number of hours with r…
## 11 daily_units sunrise "iso8601" Sun rise and set times
## 12 daily_units sunset "iso8601" Sun rise and set times
## 13 daily_units windspeed_10m_max "km/h" Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max "km/h" Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg " Dominant wind direction
## 16 daily_units shortwave_radiation_sum "MJ/m²" The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm" Daily sum of ET0 Reference…
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seco…¹ timez…² timez…³ eleva…⁴
## <dbl> <dbl> <dbl> <int> <chr> <chr> <dbl>
## 1 41.8 -87.7 508. -18000 Americ… CDT 180
## # … with abbreviated variable names ¹utc_offset_seconds, ²timezone,
## # ³timezone_abbreviation, ⁴elevation
prettyOpenMeteoMeta(tmpOMDaily)
##
## latitude: 41.8
## longitude: -87.7
## generationtime_ms: 508.3281
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
# Exploration of precipitation hours by day
# (possible values are 0-24; print(n=30) shows every row of the count table)
tmpOMDaily$tblDaily %>% count(precipitation_hours) %>% print(n=30)
## # A tibble: 25 × 2
## precipitation_hours n
## <dbl> <int>
## 1 0 2499
## 2 1 287
## 3 2 288
## 4 3 231
## 5 4 180
## 6 5 201
## 7 6 152
## 8 7 157
## 9 8 123
## 10 9 121
## 11 10 98
## 12 11 90
## 13 12 74
## 14 13 53
## 15 14 70
## 16 15 41
## 17 16 54
## 18 17 48
## 19 18 36
## 20 19 31
## 21 20 22
## 22 21 11
## 23 22 15
## 24 23 18
## 25 24 14
# Density of daily precipitation hours, one curve per complete year
# (2023 excluded as a partial year)
tmpOMDaily$tblDaily %>%
filter(lubridate::year(date)<=2022) %>%
ggplot(aes(x=precipitation_hours)) +
geom_density(aes(group=lubridate::year(date), color=as.factor(lubridate::year(date)))) +
scale_color_discrete("Year") +
labs(title="Hours of Precipitation per Day", x="Hours of Precipitation", y="Annual density")
# Same distribution as per-year faceted histograms (25 bins, one per
# possible hour count 0-24)
tmpOMDaily$tblDaily %>%
filter(lubridate::year(date)<=2022) %>%
ggplot(aes(x=precipitation_hours)) +
geom_histogram(aes(fill=as.factor(lubridate::year(date))), bins=25) +
scale_fill_discrete("Year") +
facet_wrap(~lubridate::year(date)) +
labs(title="Hours of Precipitation per Day", x="Hours of Precipitation", y="# Days")
Precipitation by month is explored:
# Monthly totals of precipitation/rain/snowfall for complete years (<=2022);
# n records how many days contribute to each year-month
dfPrecip <- tmpOMDaily$tblDaily %>%
filter(lubridate::year(date)<=2022) %>%
select(date, precipitation_sum, rain_sum, snowfall_sum) %>%
mutate(month=factor(month.abb[lubridate::month(date)], levels=month.abb),
yyyymm=customYYYYMM(date)
) %>%
group_by(yyyymm, month) %>%
summarize(across(where(is.numeric), sum), n=n(), .groups="drop")
dfPrecip
## # A tibble: 156 × 6
## yyyymm month precipitation_sum rain_sum snowfall_sum n
## <chr> <fct> <dbl> <dbl> <dbl> <int>
## 1 2010-01 Jan 25.9 12.6 10.8 31
## 2 2010-02 Feb 36.1 0.1 28.6 28
## 3 2010-03 Mar 58 47.7 7.21 31
## 4 2010-04 Apr 100. 100. 0 30
## 5 2010-05 May 154. 154. 0 31
## 6 2010-06 Jun 226. 226. 0 30
## 7 2010-07 Jul 145. 145. 0 31
## 8 2010-08 Aug 66.4 66.4 0 31
## 9 2010-09 Sep 104. 104. 0 30
## 10 2010-10 Oct 60.7 60.7 0 31
## # … with 146 more rows
# Boxplot of precipitation by month
dfPrecip %>%
select(-n) %>%
pivot_longer(-c(yyyymm, month)) %>%
# snowfall_sum is reported in cm (see tblUnits); x10 converts to mm so all
# facets share a mm-scaled axis
ggplot(aes(x=month, y=ifelse(name=="snowfall_sum", 10*value, value))) +
geom_boxplot(fill="lightblue") +
facet_wrap(~name, scales="free_y") +
labs(title="Precipitation by month (2010-2022)", y="Precipitation (mm)", x=NULL) +
theme(axis.text.x = element_text(angle = 90)) +
lims(y=c(0, NA))
# Mean precipitation by month: stacked look via overplotting -- total
# precipitation (green) drawn first, rain (light blue) drawn on top
dfPrecip %>%
group_by(month) %>%
summarize(across(where(is.numeric), mean)) %>%
ggplot(aes(x=month)) +
geom_col(aes(y=precipitation_sum), fill="green") +
geom_col(aes(y=rain_sum), fill="lightblue") +
# rain label centered in the blue bar
geom_text(aes(y=rain_sum/2, label=round(rain_sum))) +
# non-rain label centered in the visible green band; suppressed when the
# band is under 3 mm so labels do not overlap
geom_text(aes(y=rain_sum/2 + precipitation_sum/2,
label=ifelse(precipitation_sum>rain_sum+3, round(precipitation_sum-rain_sum), "")
)
) +
geom_text(aes(y=precipitation_sum+5, label=round(precipitation_sum))) +
labs(x=NULL,
y="Precipitation (mm)",
title="Mean precipitation by month (2010-2022)",
subtitle="Light blue is mm falling as rain, green is liquid equivalent of other"
)
Average temperatures by month are also explored:
# Monthly means of the daily max/min actual and apparent temperatures for
# complete years (<=2022); n = days contributing to each year-month
dfTemp <- tmpOMDaily$tblDaily %>%
filter(lubridate::year(date)<=2022) %>%
select(date,
temperature_2m_max,
temperature_2m_min,
apparent_temperature_max,
apparent_temperature_min
) %>%
mutate(month=factor(month.abb[lubridate::month(date)], levels=month.abb),
yyyymm=customYYYYMM(date)
) %>%
group_by(yyyymm, month) %>%
summarize(across(where(is.numeric), mean), n=n(), .groups="drop")
dfTemp
## # A tibble: 156 × 7
## yyyymm month temperature_2m_max temperature_2m_min apparent_…¹ appar…² n
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2010-01 Jan -2.46 -8.18 -7.64 -13.7 31
## 2 2010-02 Feb -0.414 -7.31 -4.85 -12.4 28
## 3 2010-03 Mar 7.67 -0.452 4.25 -5.13 31
## 4 2010-04 Apr 16.9 7.40 14.6 3.56 30
## 5 2010-05 May 20.5 13.1 20.5 11.3 31
## 6 2010-06 Jun 26.0 19.1 28.0 19.5 30
## 7 2010-07 Jul 29.1 21.7 32.0 23.5 31
## 8 2010-08 Aug 28.6 21.5 31.2 23.0 31
## 9 2010-09 Sep 22.7 16.0 22.1 14.8 30
## 10 2010-10 Oct 18.1 10.3 15.4 7.14 31
## # … with 146 more rows, and abbreviated variable names
## # ¹apparent_temperature_max, ²apparent_temperature_min
# Boxplot of monthly-average temperatures by calendar month
# (previous comment said "precipitation" -- copy/paste slip)
dfTemp %>%
select(-n) %>%
pivot_longer(-c(yyyymm, month)) %>%
ggplot(aes(x=month, y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~name) +
labs(title="Average temperature by month (2010-2022)", y="Average temperature (C)", x=NULL) +
theme(axis.text.x = element_text(angle = 90))
# Mean temperatures by month: split names like apparent_temperature_max into
# measure (actual/apparent) and type (max/min), then draw a high-low bar
dfTemp %>%
select(-n) %>%
group_by(month) %>%
summarize(across(where(is.numeric), mean)) %>%
pivot_longer(cols=-c(month)) %>%
mutate(measType=stringr::str_replace(name, ".*_", ""),
meas=ifelse(str_detect(name, "apparent"), "apparent", "actual")
) %>%
select(-name) %>%
pivot_wider(id_cols=c(month, meas), names_from="measType", values_from="value") %>%
ggplot(aes(x=month)) +
# tile spans from the monthly mean min up to the mean max
geom_tile(aes(y=(max+min)/2, height=max-min), width=0.5, fill="lightblue") +
geom_text(aes(y=max+1, label=round(max, 1)), size=2.5) +
geom_text(aes(y=min-1, label=round(min, 1)), size=2.5) +
labs(x=NULL,
y="Temperature (C)",
title="Mean high and low temperature by month (2010-2022)",
subtitle="Actual temperature and apparent temperature"
) +
facet_wrap(~meas)
Sunrise and sunset times are explored:
# Sunrise/sunset prep: parse the ISO timestamps, then derive clock-time
# (hms) and day-of-year columns for plotting
dfSun <- tmpOMDaily$tblDaily %>%
filter(lubridate::year(date)<=2022) %>%
select(date, sunrise, sunset) %>%
mutate(month=factor(month.abb[lubridate::month(date)], levels=month.abb),
yyyymm=customYYYYMM(date),
across(c(sunrise, sunset), lubridate::ymd_hm),
sr=hms::as_hms(sunrise),
ss=hms::as_hms(sunset),
doy=lubridate::yday(date),
year=lubridate::year(date)
)
dfSun
## # A tibble: 4,748 × 9
## date sunrise sunset month yyyymm sr ss
## <date> <dttm> <dttm> <fct> <chr> <time> <tim>
## 1 2010-01-01 2010-01-01 08:16:00 2010-01-01 17:32:00 Jan 2010-01 08:16 17:32
## 2 2010-01-02 2010-01-02 08:16:00 2010-01-02 17:33:00 Jan 2010-01 08:16 17:33
## 3 2010-01-03 2010-01-03 08:16:00 2010-01-03 17:34:00 Jan 2010-01 08:16 17:34
## 4 2010-01-04 2010-01-04 08:16:00 2010-01-04 17:34:00 Jan 2010-01 08:16 17:34
## 5 2010-01-05 2010-01-05 08:16:00 2010-01-05 17:35:00 Jan 2010-01 08:16 17:35
## 6 2010-01-06 2010-01-06 08:16:00 2010-01-06 17:36:00 Jan 2010-01 08:16 17:36
## 7 2010-01-07 2010-01-07 08:16:00 2010-01-07 17:37:00 Jan 2010-01 08:16 17:37
## 8 2010-01-08 2010-01-08 08:16:00 2010-01-08 17:38:00 Jan 2010-01 08:16 17:38
## 9 2010-01-09 2010-01-09 08:16:00 2010-01-09 17:39:00 Jan 2010-01 08:16 17:39
## 10 2010-01-10 2010-01-10 08:15:00 2010-01-10 17:40:00 Jan 2010-01 08:15 17:40
## # … with 4,738 more rows, and 2 more variables: doy <dbl>, year <dbl>
# Plot of sunrise and sunset by day of year; the midpoint line (ss+sr)/2
# approximates solar noon
dfSun %>%
select(date, month, year, doy, sr, ss) %>%
ggplot(aes(x=doy, group=factor(year), color=factor(year))) +
geom_line(aes(y=sr)) +
geom_line(aes(y=ss)) +
geom_line(aes(y=(ss+sr)/2)) +
labs(x="Day of year", y="Time (always on DST)", title="Sunrise, sunset, and solar noon by day of year") +
scale_color_discrete("Year")
# Plot of minutes gained from earliest/latest: per-year deltas from the
# latest sunrise and the earliest sunset
dfSun %>%
select(date, month, year, doy, sr, ss) %>%
group_by(year) %>%
mutate(dsr=max(sr)-sr, dss=ss-min(ss)) %>%
ungroup() %>%
rename(sunrise_change=dsr, sunset_change=dss) %>%
pivot_longer(cols=c(sunrise_change, sunset_change)) %>%
ggplot(aes(x=doy)) +
# value appears to be an hms time difference in seconds; /60 -> minutes
# (TODO confirm units against the hms class)
geom_point(aes(y=as.numeric(value)/60, color=name), size=0.5) +
labs(x="Day of year", y="Minutes", title="Delta from latest sunrise / earliest sunset") +
scale_color_discrete("Metric")
Wind data is explored:
# Wind prep: rename dominant direction / max speed / max gust, and add
# rounded versions (nearest 10 degrees, nearest 5 km/h) for binned plots.
# No year filter here; plots filter to <=2022 individually where intended.
dfWind <- tmpOMDaily$tblDaily %>%
select(date,
dir=winddirection_10m_dominant,
spd=windspeed_10m_max,
gst=windgusts_10m_max
) %>%
mutate(month=lubridate::month(date),
year=lubridate::year(date),
dir10=round(dir/10)*10,
spd5=round(spd/5)*5,
gst5=round(gst/5)*5
)
dfWind
## # A tibble: 4,914 × 9
## date dir spd gst month year dir10 spd5 gst5
## <date> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010-01-01 291 23.4 38.2 1 2010 290 25 40
## 2 2010-01-02 309 24.3 40.3 1 2010 310 25 40
## 3 2010-01-03 313 21.6 35.6 1 2010 310 20 35
## 4 2010-01-04 304 21 35.3 1 2010 300 20 35
## 5 2010-01-05 298 19.8 33.1 1 2010 300 20 35
## 6 2010-01-06 280 16.4 27 1 2010 280 15 25
## 7 2010-01-07 240 16.3 29.5 1 2010 240 15 30
## 8 2010-01-08 334 27.6 45 1 2010 330 30 45
## 9 2010-01-09 305 17.2 28.1 1 2010 300 15 30
## 10 2010-01-10 226 27.9 46.4 1 2010 230 30 45
## # … with 4,904 more rows
# Plot of wind direction and speed on a polar grid, faceted by month
# NOTE(review): not filtered to <=2022, so partial-2023 days are included --
# confirm intended (the binned version below does filter)
dfWind %>%
ggplot(aes(x=dir, y=spd)) +
geom_point(alpha=0.2, size=0.5) +
coord_polar() +
facet_wrap(~factor(month.abb[month], levels=month.abb), nrow=2) +
geom_vline(xintercept=c(0, 90, 180, 270), lty=2, color="red") +
labs(title="Maximum wind speed and predominant direction (measured daily)",
y="Maximum Wind speed (km/h)",
x="Predominant Wind direction"
) +
scale_x_continuous(breaks=c(0, 90, 180, 270))
# Binned version: counts of rounded direction/speed pairs, point size = days
dfWind %>%
filter(lubridate::year(date)<=2022) %>%
count(month, dir10, spd5) %>%
ggplot(aes(x=dir10, y=spd5)) +
geom_point(aes(size=n), alpha=0.2) +
coord_polar() +
facet_wrap(~factor(month.abb[month], levels=month.abb)) +
geom_vline(xintercept=c(0, 90, 180, 270), lty=2, color="red") +
labs(title="Maximum wind speed and predominant direction (measured daily)",
subtitle="Wind speed rounded to nearest 5 km/h, wind direction rounded to nearest 10 degrees",
y="Maximum Wind speed (km/h)",
x="Predominant Wind direction"
) +
scale_x_continuous(breaks=c(0, 90, 180, 270))
# Plot of predominant wind direction
dfWind %>%
ggplot(aes(x=dir10)) +
geom_histogram(binwidth=10) +
facet_wrap(~factor(month.abb[month], levels=month.abb)) +
geom_vline(xintercept=c(0, 90, 180, 270, 360), lty=2, color="red") +
labs(title="Predominant wind direction (measured daily)",
y="# Days",
x="Predominant wind direction (rounded to nearest 10 degrees)"
) +
scale_x_continuous(breaks=c(0, 90, 180, 270, 360))
# Plot of maximum wind speed
dfWind %>%
ggplot(aes(x=spd5)) +
geom_histogram(binwidth=5) +
facet_wrap(~factor(month.abb[month], levels=month.abb)) +
labs(title="Maximum wind speed (measured daily)",
y="# Days",
x="Maximum wind speed (km/h, rounded to nearest 5 km/h)"
)
# Boxplot of daily maximum wind speed and gusts by month (complete years)
dfWind %>%
filter(year<=2022) %>%
select(date, month, year, spd, gst) %>%
pivot_longer(cols=-c(date, month, year)) %>%
ggplot(aes(x=factor(month.abb[month], levels=month.abb), y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~c("gst"="2. Maximum wind gust", "spd"="1. Maximum wind speed")[name]) +
labs(title="Wind speed measured daily (2010-2022)", y="Wind speed (km/h)", x=NULL) +
theme(axis.text.x = element_text(angle = 90)) +
lims(y=c(0, NA))
Weather codes, radiation, and evapotranspiration are explored:
# Other metrics prep: weather code (as an ordered factor of observed codes),
# shortwave radiation sum, and FAO ET0 evapotranspiration
dfOther <- tmpOMDaily$tblDaily %>%
select(date, wc=weathercode, sw=shortwave_radiation_sum, et=et0_fao_evapotranspiration) %>%
mutate(wc=factor(wc, levels=sort(unique(wc))),
year=lubridate::year(date),
month=factor(month.abb[lubridate::month(date)], levels=month.abb),
yyyymm=customYYYYMM(date)
)
dfOther
## # A tibble: 4,914 × 7
## date wc sw et year month yyyymm
## <date> <fct> <dbl> <dbl> <dbl> <fct> <chr>
## 1 2010-01-01 3 6.94 0.53 2010 Jan 2010-01
## 2 2010-01-02 2 7.91 0.49 2010 Jan 2010-01
## 3 2010-01-03 3 5.62 0.46 2010 Jan 2010-01
## 4 2010-01-04 3 5.09 0.48 2010 Jan 2010-01
## 5 2010-01-05 3 6.61 0.52 2010 Jan 2010-01
## 6 2010-01-06 71 7.47 0.48 2010 Jan 2010-01
## 7 2010-01-07 73 3.82 0.29 2010 Jan 2010-01
## 8 2010-01-08 73 6.47 0.53 2010 Jan 2010-01
## 9 2010-01-09 3 6.22 0.38 2010 Jan 2010-01
## 10 2010-01-10 3 8.99 0.35 2010 Jan 2010-01
## # … with 4,904 more rows
# Histogram of weather code by month, complete years only
dfOther %>%
filter(year<=2022) %>%
ggplot(aes(x=wc)) +
geom_bar() +
facet_wrap(~month) +
labs(title="Weather codes by month (2010-2022)", y="Count", x="Weather code") +
theme(axis.text.x = element_text(angle = 90))
# Mean radiation and evapotranspiration by month
# NOTE(review): unlike the plot above, no year filter is applied here even
# though the title says 2010-2022 -- partial-2023 days are included; confirm
dfOther %>%
select(-year) %>%
group_by(month) %>%
summarize(across(where(is.numeric), mean)) %>%
pivot_longer(cols=-c(month)) %>%
ggplot(aes(x=month)) +
geom_point(aes(y=value)) +
geom_line(aes(y=value, group=1)) +
labs(x=NULL,
y=NULL,
title="Mean radiation and evapotranspiration by month (2010-2022)",
subtitle="Evapotranspiration (mm) and Radiation (MegaJoules)"
) +
facet_wrap(~c("et"="Evapotranspiration (mm)", "sw"="Radiation (MJ)")[name], scales="free_y") +
lims(y=c(0, NA))
# Boxplot for radiation and evapotranspiration by month
# NOTE(review): same missing <=2022 filter as above -- confirm intended
dfOther %>%
select(date, month, sw, et) %>%
pivot_longer(-c(date, month)) %>%
ggplot(aes(x=month)) +
geom_boxplot(aes(y=value), fill="lightblue") +
labs(x=NULL,
y=NULL,
title="Daily radiation and evapotranspiration (2010-2022)",
subtitle="Evapotranspiration (mm) and Radiation (MegaJoules)"
) +
facet_wrap(~c("et"="Evapotranspiration (mm)", "sw"="Radiation (MJ)")[name], scales="free_y") +
lims(y=c(0, NA))
The hourly-data URL is tested by downloading the file; the result is cached to avoid multiple hits to the server:
# Build an hourly-only URL requesting every metric in tblMetricsHourly
# across the full 2010-2023 span
testURLHourly <- helperOpenMeteoURL(cityName="Chicago IL",
hourlyIndices=1:nrow(tblMetricsHourly),
startDate="2010-01-01",
endDate="2023-06-15",
tz="America/Chicago"
)
##
## Hourly metrics created from indices: temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm
testURLHourly
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=41.84&longitude=-87.68&start_date=2010-01-01&end_date=2023-06-15&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm&timezone=America%2FChicago"
# Download file (cached: skipped when it already exists on disk)
if(!file.exists("notuse_testOM_hourly.json")) {
fileDownload(fileName="notuse_testOM_hourly.json", url=testURLHourly)
} else {
cat("\nFile notuse_testOM_hourly.json already exists, skipping download\n")
}
## size isdir mode mtime
## notuse_testOM_hourly.json 20178300 FALSE 666 2023-06-30 08:03:13
## ctime atime exe
## notuse_testOM_hourly.json 2023-06-30 08:02:50 2023-06-30 08:03:13 no
Data are read and stored as a list:
tmpOMHourly <- readOpenMeteoJSON("notuse_testOM_hourly.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly
tmpOMHourly
## $tblDaily
## NULL
##
## $tblHourly
## # A tibble: 117,936 × 37
## time date hour temper…¹ relat…² dewpo…³ appar…⁴ press…⁵
## <dttm> <date> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2010-01-01 00:00:00 2010-01-01 0 -9.5 67 -14.4 -15.8 1024.
## 2 2010-01-01 01:00:00 2010-01-01 1 -9.8 69 -14.4 -16.3 1025.
## 3 2010-01-01 02:00:00 2010-01-01 2 -10.3 73 -14.2 -16.8 1025.
## 4 2010-01-01 03:00:00 2010-01-01 3 -10.8 74 -14.5 -17.2 1026.
## 5 2010-01-01 04:00:00 2010-01-01 4 -11.3 75 -14.8 -17.7 1026.
## 6 2010-01-01 05:00:00 2010-01-01 5 -11.8 76 -15.1 -18.2 1026.
## 7 2010-01-01 06:00:00 2010-01-01 6 -12.3 77 -15.5 -18.6 1027.
## 8 2010-01-01 07:00:00 2010-01-01 7 -12.8 78 -15.8 -19 1028.
## 9 2010-01-01 08:00:00 2010-01-01 8 -13.2 79 -16.1 -19.4 1028.
## 10 2010-01-01 09:00:00 2010-01-01 9 -13.4 78 -16.3 -19.6 1028.
## # … with 117,926 more rows, 29 more variables: surface_pressure <dbl>,
## # precipitation <dbl>, rain <dbl>, snowfall <dbl>, cloudcover <int>,
## # cloudcover_low <int>, cloudcover_mid <int>, cloudcover_high <int>,
## # shortwave_radiation <dbl>, direct_radiation <dbl>,
## # direct_normal_irradiance <dbl>, diffuse_radiation <dbl>,
## # windspeed_10m <dbl>, windspeed_100m <dbl>, winddirection_10m <int>,
## # winddirection_100m <int>, windgusts_10m <dbl>, …
##
## $tblUnits
## # A tibble: 34 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units time iso8601 <NA>
## 2 hourly_units temperature_2m deg C Air temperature at 2 meters above …
## 3 hourly_units relativehumidity_2m % Relative humidity at 2 meters abov…
## 4 hourly_units dewpoint_2m deg C Dew point temperature at 2 meters …
## 5 hourly_units apparent_temperature deg C Apparent temperature is the percei…
## 6 hourly_units pressure_msl hPa Atmospheric air pressure reduced t…
## 7 hourly_units surface_pressure hPa Atmospheric air pressure reduced t…
## 8 hourly_units precipitation mm Total precipitation (rain, showers…
## 9 hourly_units rain mm Only liquid precipitation of the p…
## 10 hourly_units snowfall cm Snowfall amount of the preceding h…
## # … with 24 more rows
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seco…¹ timez…² timez…³ eleva…⁴
## <dbl> <dbl> <dbl> <int> <chr> <chr> <dbl>
## 1 41.8 -87.7 6370. -18000 Americ… CDT 180
## # … with abbreviated variable names ¹utc_offset_seconds, ²timezone,
## # ³timezone_abbreviation, ⁴elevation
prettyOpenMeteoMeta(tmpOMHourly)
##
## latitude: 41.8
## longitude: -87.7
## generationtime_ms: 6369.988
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
Consistency of data between daily and hourly is explored:
# Variables where maximum of hourly should be created
vrblMax <- c("weathercode", "temperature_2m", "apparent_temperature", "windspeed_10m", "windgusts_10m")
# Variables where minimum of hourly should be created
vrblMin <- c("temperature_2m", "apparent_temperature")
# Variables where sum of hourly should be created
vrblSum <- c("precipitation", "rain", "snowfall", "shortwave_radiation", "et0_fao_evapotranspiration")
# Variables in daily not to explore
# date, time, sunrise, sunset
# Variables that require a different approach
# winddirection_10m_dominant, precipitation_hours
# Check that all variables are included in hourly data
c(vrblMax, vrblMin, vrblSum) %in% (tmpOMHourly$tblHourly %>% names)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Create daily data from hourly
# Collapse hourly rows to one row per date; each aggregated column gets a
# _max/_min/_sum suffix via the .names glue spec, and precipitation_hours
# counts hours with any precipitation (to mirror the daily API field)
dfDailyFromHourly <- tmpOMHourly$tblHourly %>%
group_by(date) %>%
summarize(across(.cols=all_of(vrblMax), .fns=max, .names="{.col}_max"),
across(.cols=all_of(vrblMin), .fns=min, .names="{.col}_min"),
across(.cols=all_of(vrblSum), .fns=sum, .names="{.col}_sum"),
precipitation_hours=sum(precipitation>0)
) %>%
# The daily API reports these two without suffixes, so rename to match its names
rename(weathercode=weathercode_max, et0_fao_evapotranspiration=et0_fao_evapotranspiration_sum)
dfDailyFromHourly
## # A tibble: 4,914 × 14
## date weatherc…¹ tempe…² appar…³ winds…⁴ windg…⁵ tempe…⁶ appar…⁷ preci…⁸
## <date> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010-01-01 3 -8.6 -14.9 23.4 38.2 -13.4 -19.6 0
## 2 2010-01-02 2 -10.4 -17.5 24.3 40.3 -15.1 -21.7 0
## 3 2010-01-03 3 -7.9 -13.6 21.6 35.6 -13.8 -20.1 0
## 4 2010-01-04 3 -6.9 -12.8 21 35.3 -12.3 -18.9 0
## 5 2010-01-05 3 -4.8 -10.1 19.8 33.1 -9.8 -15.7 0
## 6 2010-01-06 71 -4.9 -9.2 16.4 27 -9 -14.4 0
## 7 2010-01-07 73 -5.2 -9.3 16.3 29.5 -8.5 -13 7.5
## 8 2010-01-08 73 -3 -9.2 27.6 45 -9.4 -15.3 2.3
## 9 2010-01-09 3 -5.8 -10.8 17.2 28.1 -12.3 -18.2 0
## 10 2010-01-10 3 -8.8 -16.2 27.9 46.4 -19.4 -25.6 0
## # … with 4,904 more rows, 5 more variables: rain_sum <dbl>, snowfall_sum <dbl>,
## # shortwave_radiation_sum <dbl>, et0_fao_evapotranspiration <dbl>,
## # precipitation_hours <int>, and abbreviated variable names ¹weathercode,
## # ²temperature_2m_max, ³apparent_temperature_max, ⁴windspeed_10m_max,
## # ⁵windgusts_10m_max, ⁶temperature_2m_min, ⁷apparent_temperature_min,
## # ⁸precipitation_sum
names(dfDailyFromHourly)
## [1] "date" "weathercode"
## [3] "temperature_2m_max" "apparent_temperature_max"
## [5] "windspeed_10m_max" "windgusts_10m_max"
## [7] "temperature_2m_min" "apparent_temperature_min"
## [9] "precipitation_sum" "rain_sum"
## [11] "snowfall_sum" "shortwave_radiation_sum"
## [13] "et0_fao_evapotranspiration" "precipitation_hours"
names(dfDailyFromHourly) %in% names(tmpOMDaily$tblDaily)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Check data consistency
# For every hourly-derived column, print TRUE when it matches the reported
# daily data, or the all.equal() difference summary when it does not
for (colName in names(dfDailyFromHourly)) {
  cmpResult <- all.equal(
    pull(dfDailyFromHourly, colName),
    pull(tmpOMDaily$tblDaily, colName)
  )
  cat("\n", colName, ":", cmpResult)
}
##
## date : TRUE
## weathercode : TRUE
## temperature_2m_max : TRUE
## apparent_temperature_max : TRUE
## windspeed_10m_max : TRUE
## windgusts_10m_max : TRUE
## temperature_2m_min : TRUE
## apparent_temperature_min : TRUE
## precipitation_sum : TRUE
## rain_sum : TRUE
## snowfall_sum : TRUE
## shortwave_radiation_sum : Mean relative difference: 0.9964
## et0_fao_evapotranspiration : TRUE
## precipitation_hours : TRUE
# Plot for differences in radiation
# Stack computed and reported daily radiation into long form; bind_rows tags
# each source with .id = "src" ("1" = first table, "2" = second), which is
# then recoded to a readable label via a named-vector lookup
dfRadiation <- bind_rows(
  select(dfDailyFromHourly, date, shortwave_radiation_sum),
  select(tmpOMDaily$tblDaily, date, shortwave_radiation_sum),
  .id="src"
) %>%
  mutate(src=c("1"="Daily from Hourly", "2"="Daily as Reported")[src])
# Overlay the two series to visualize the difference between sources
dfRadiation %>%
  ggplot(aes(x=date, y=shortwave_radiation_sum)) +
  geom_line(aes(group=src, color=src)) +
  labs(x=NULL, y="Sum of radiation", title="Comparison of shortwave radiation by day by source")
# Exploration of units
# Look up the reported unit for the daily radiation column: megajoules/m²
tmpOMDaily$tblUnits %>% filter(name=="shortwave_radiation_sum")
## # A tibble: 1 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 daily_units shortwave_radiation_sum MJ/m² The sum of solar radiaion on a give…
# The hourly radiation column is in watts/m² — a different unit, which
# explains the scale gap between the two series in the plot above
tmpOMHourly$tblUnits %>% filter(name=="shortwave_radiation")
## # A tibble: 1 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units shortwave_radiation W/m² Shortwave solar radiation as average o…
# Conversion of watt-hours to megajoules
# 1 watt-hour = 0.0036 megajoules
# Re-plot with the hourly-derived series scaled by 0.0036 so both sources
# are in MJ/m²; the reported series is left unscaled (factor of 1)
dfRadiation %>%
ggplot(aes(x=date, y=ifelse(src=="Daily from Hourly", 0.0036, 1)*shortwave_radiation_sum)) +
geom_line(aes(group=src, color=src)) +
labs(x=NULL,
y="Sum of radiation",
title="Comparison of shortwave radiation by day by source",
subtitle="Summed from hourly multiplied by 0.0036 to convert Watt-hours to MegaJoules"
)
# Widen to one column per source and compute the reported/derived ratio;
# the summary shows the ratio clusters tightly around 0.0036, confirming the
# discrepancy is purely the unit conversion (plus rounding)
dfRadiation %>%
pivot_wider(id_cols="date", names_from="src", values_from="shortwave_radiation_sum") %>%
mutate(rat=`Daily as Reported`/`Daily from Hourly`) %>%
summary()
## date Daily from Hourly Daily as Reported rat
## Min. :2010-01-01 Min. : 135 Min. : 0.49 Min. :0.003585
## 1st Qu.:2013-05-13 1st Qu.:2304 1st Qu.: 8.29 1st Qu.:0.003599
## Median :2016-09-22 Median :4062 Median :14.62 Median :0.003600
## Mean :2016-09-22 Mean :4190 Mean :15.08 Mean :0.003600
## 3rd Qu.:2020-02-02 3rd Qu.:6056 3rd Qu.:21.80 3rd Qu.:0.003601
## Max. :2023-06-15 Max. :8788 Max. :31.64 Max. :0.003630
With the exception of radiation (reported in different units, causing slight rounding differences), the reported daily data matches the expected aggregate of the reported hourly data.
Precipitation by hour is explored:
# Hourly precipitation data
# Long-format hourly precipitation/snowfall/rain with year and month added;
# month is an ordered factor (Jan..Dec) so facets sort chronologically
dfHourlyPrecip <- tmpOMHourly$tblHourly %>%
select(time, hour, precipitation, snowfall, rain) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
# pivot the three metrics into name/value pairs for faceted plotting
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyPrecip
## # A tibble: 353,808 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 0 2010 Jan precipitation 0
## 2 2010-01-01 00:00:00 0 2010 Jan snowfall 0
## 3 2010-01-01 00:00:00 0 2010 Jan rain 0
## 4 2010-01-01 01:00:00 1 2010 Jan precipitation 0
## 5 2010-01-01 01:00:00 1 2010 Jan snowfall 0
## 6 2010-01-01 01:00:00 1 2010 Jan rain 0
## 7 2010-01-01 02:00:00 2 2010 Jan precipitation 0
## 8 2010-01-01 02:00:00 2 2010 Jan snowfall 0
## 9 2010-01-01 02:00:00 2 2010 Jan rain 0
## 10 2010-01-01 03:00:00 3 2010 Jan precipitation 0
## # … with 353,798 more rows
# Nil precipitation percent
# Share of hours with exactly zero value, per month and metric; used below
# to annotate the histograms (which exclude the zeros)
dfNilPrecip <- dfHourlyPrecip %>%
group_by(month, name) %>%
summarize(pctNil=mean(value==0), .groups="drop")
dfNilPrecip
## # A tibble: 36 × 3
## month name pctNil
## <fct> <chr> <dbl>
## 1 Jan precipitation 0.845
## 2 Jan rain 0.925
## 3 Jan snowfall 0.877
## 4 Feb precipitation 0.845
## 5 Feb rain 0.938
## 6 Feb snowfall 0.865
## 7 Mar precipitation 0.851
## 8 Mar rain 0.884
## 9 Mar snowfall 0.946
## 10 Apr precipitation 0.807
## # … with 26 more rows
# Graphs of precipitation amount by month
# One histogram figure per metric (precipitation, snowfall, rain); zero-value
# hours are excluded so the nonzero distribution shape is visible
for(metric in unique(dfHourlyPrecip$name)) {
p1 <- dfHourlyPrecip %>%
filter(name==metric, value>0, year<=2022) %>%
ggplot() +
geom_histogram(aes(x=value), bins = 50) +
facet_wrap(~month) +
labs(x=NULL,
y=NULL,
title=paste0(metric,
": hourly total (",
# `value` here is the unit string (e.g. "mm") from the units table
tmpOMHourly$tblUnits %>% filter(name==metric) %>% pull(value),
") from 2010-2022"
)
) +
# x=Inf/y=Inf with hjust/vjust=1 pins the annotation at each panel's top-right
geom_text(data=dfNilPrecip %>% filter(name==metric),
aes(x=Inf,
y=Inf,
label=paste0("Excludes ", round(100*pctNil, 1), "%\nof observations at 0")
),
size=2.5,
hjust=1,
vjust=1
)
print(p1)
}
Temperature by hour is explored:
# Hourly temperature data
# Long-format hourly temperature metrics (actual, apparent, dewpoint) with
# year and ordered month factor, same shape as dfHourlyPrecip
dfHourlyTemp <- tmpOMHourly$tblHourly %>%
select(time, hour, temperature_2m, apparent_temperature, dewpoint_2m) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyTemp
## # A tibble: 353,808 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 0 2010 Jan temperature_2m -9.5
## 2 2010-01-01 00:00:00 0 2010 Jan apparent_temperature -15.8
## 3 2010-01-01 00:00:00 0 2010 Jan dewpoint_2m -14.4
## 4 2010-01-01 01:00:00 1 2010 Jan temperature_2m -9.8
## 5 2010-01-01 01:00:00 1 2010 Jan apparent_temperature -16.3
## 6 2010-01-01 01:00:00 1 2010 Jan dewpoint_2m -14.4
## 7 2010-01-01 02:00:00 2 2010 Jan temperature_2m -10.3
## 8 2010-01-01 02:00:00 2 2010 Jan apparent_temperature -16.8
## 9 2010-01-01 02:00:00 2 2010 Jan dewpoint_2m -14.2
## 10 2010-01-01 03:00:00 3 2010 Jan temperature_2m -10.8
## # … with 353,798 more rows
# Graphs of temperature metrics by month (boxplots by hour of day)
for(metric in unique(dfHourlyTemp$name)) {
p1 <- dfHourlyTemp %>%
filter(name==metric, year<=2022) %>%
ggplot() +
geom_boxplot(aes(x=factor(hour), y=value), fill = "lightblue") +
facet_wrap(~month) +
labs(x=NULL,
y=NULL,
title=paste0(metric,
": hourly boxplot (",
# `value` here is the unit string from the units table
tmpOMHourly$tblUnits %>% filter(name==metric) %>% pull(value),
") from 2010-2022"
)
)
print(p1)
}
# Spread of temperature by day
# Per-day high/low/median for each metric, then the average spread per month;
# note across(where(is.numeric), mean) also averages the ungrouped `year`
# column, which is harmless since only `spd` is plotted
dfHourlyTemp %>%
mutate(date=lubridate::date(time)) %>%
group_by(year, month, date, name) %>%
summarize(maxValue=max(value), minValue=min(value), mdnValue=median(value), .groups="drop") %>%
mutate(spd=maxValue-minValue) %>%
group_by(month, name) %>%
summarize(across(where(is.numeric), mean), .groups="drop") %>%
ggplot(aes(x=fct_rev(month), y=spd)) +
geom_point() +
coord_flip() +
facet_wrap(~name) +
labs(title="Average high/low spread of key metrics by month (deg C)", x=NULL, y="deg C") +
lims(y=c(0, NA))
Hours with maximum/minimum temperature and precipitation are explored:
# Create temperature and precipitation data
# Combine the two long-format tables (six metrics total) and sort by
# time then metric name so related rows sit together
dfHourlyTempPrecip <- dfHourlyTemp %>%
bind_rows(dfHourlyPrecip) %>%
mutate(date=lubridate::date(time)) %>%
arrange(time, name)
dfHourlyTempPrecip
## # A tibble: 707,616 × 7
## time hour year month name value date
## <dttm> <int> <dbl> <fct> <chr> <dbl> <date>
## 1 2010-01-01 00:00:00 0 2010 Jan apparent_temperature -15.8 2010-01-01
## 2 2010-01-01 00:00:00 0 2010 Jan dewpoint_2m -14.4 2010-01-01
## 3 2010-01-01 00:00:00 0 2010 Jan precipitation 0 2010-01-01
## 4 2010-01-01 00:00:00 0 2010 Jan rain 0 2010-01-01
## 5 2010-01-01 00:00:00 0 2010 Jan snowfall 0 2010-01-01
## 6 2010-01-01 00:00:00 0 2010 Jan temperature_2m -9.5 2010-01-01
## 7 2010-01-01 01:00:00 1 2010 Jan apparent_temperature -16.3 2010-01-01
## 8 2010-01-01 01:00:00 1 2010 Jan dewpoint_2m -14.4 2010-01-01
## 9 2010-01-01 01:00:00 1 2010 Jan precipitation 0 2010-01-01
## 10 2010-01-01 01:00:00 1 2010 Jan rain 0 2010-01-01
## # … with 707,606 more rows
# Limit to temperature, dewpoint, and precipitation
# Limit precipitation to only days with precipitation > 0
tmpDF <- dfHourlyTempPrecip %>%
filter(name %in% c("dewpoint_2m", "precipitation", "temperature_2m")) %>%
group_by(date, name) %>%
# keep all rows for temp/dewpoint; for precipitation keep only days where
# the within-day sum is positive
filter(name!="precipitation" | sum(value)>0) %>%
# flag every hour tied for the daily max/min (ties all counted in full)
mutate(isMax=ifelse(value==max(value), 1, 0), isMin=ifelse(value==min(value), 1, 0)) %>%
# regroup: share of days on which each hour holds the max/min, by month
group_by(name, month, hour) %>%
summarize(across(c(isMax, isMin), mean), .groups="drop") %>%
pivot_longer(-c(name, month, hour), names_to="metric")
tmpDF
## # A tibble: 1,728 × 5
## name month hour metric value
## <chr> <fct> <int> <chr> <dbl>
## 1 dewpoint_2m Jan 0 isMax 0.212
## 2 dewpoint_2m Jan 0 isMin 0.196
## 3 dewpoint_2m Jan 1 isMax 0.0507
## 4 dewpoint_2m Jan 1 isMin 0.0968
## 5 dewpoint_2m Jan 2 isMax 0.0599
## 6 dewpoint_2m Jan 2 isMin 0.0484
## 7 dewpoint_2m Jan 3 isMax 0.0346
## 8 dewpoint_2m Jan 3 isMin 0.0253
## 9 dewpoint_2m Jan 4 isMax 0.0184
## 10 dewpoint_2m Jan 4 isMin 0.0507
## # … with 1,718 more rows
# Plot max/min for metric
# One figure per metric: line per hour showing how often that hour is the
# daily max or min, faceted by month
for(keyMetric in unique(tmpDF$name)) {
p1 <- tmpDF %>%
filter(name==keyMetric) %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(color=metric, group=metric)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="% of time as max/min",
title=paste0(keyMetric, ": maximum and minimum by hour"),
subtitle=paste0("Ties included as full value",
ifelse(keyMetric=="precipitation", " (days with no precipitation excluded)", "")
)
) +
scale_color_discrete("Metric:")
print(p1)
}
# Plot percent of hours with precipitation
# NOTE(review): pct0 tests value>0 but is labelled ">=0.1 mm" — this presumes
# precipitation is reported in 0.1 mm increments; confirm against the API spec
dfHourlyTempPrecip %>%
filter(name=="precipitation") %>%
group_by(month, hour) %>%
summarize(pct0=mean(value>0), pct05=mean(value>=0.5), .groups="drop") %>%
pivot_longer(-c(month, hour)) %>%
mutate(name=ifelse(name=="pct0", ">=0.1 mm", ">=0.5 mm")) %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(group=name, color=name)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="% at/above precipitation hurdle",
title=paste0("% of observations with precipitation in past hour")
) +
lims(y=c(0, NA))
Wind by hour is explored:
# Hourly wind data
# Long-format hourly wind speed, gusts, and direction with year and
# ordered month factor, same shape as the other hourly tables
dfHourlyWind <- tmpOMHourly$tblHourly %>%
select(time, hour, windspeed_10m, windgusts_10m, winddirection_10m) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyWind
## # A tibble: 353,808 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 0 2010 Jan windspeed_10m 18.7
## 2 2010-01-01 00:00:00 0 2010 Jan windgusts_10m 33.8
## 3 2010-01-01 00:00:00 0 2010 Jan winddirection_10m 298
## 4 2010-01-01 01:00:00 1 2010 Jan windspeed_10m 20.1
## 5 2010-01-01 01:00:00 1 2010 Jan windgusts_10m 32.4
## 6 2010-01-01 01:00:00 1 2010 Jan winddirection_10m 291
## 7 2010-01-01 02:00:00 2 2010 Jan windspeed_10m 19.9
## 8 2010-01-01 02:00:00 2 2010 Jan windgusts_10m 34.2
## 9 2010-01-01 02:00:00 2 2010 Jan winddirection_10m 290
## 10 2010-01-01 03:00:00 3 2010 Jan windspeed_10m 19.5
## # … with 353,798 more rows
# Graphs of wind speed/gust by month
# Direction is excluded here (degrees don't suit a boxplot); speed and
# gusts each get one faceted boxplot figure
for(metric in setdiff(unique(dfHourlyWind$name), "winddirection_10m")) {
p1 <- dfHourlyWind %>%
filter(name==metric, year<=2022) %>%
ggplot() +
geom_boxplot(aes(x=factor(hour), y=value), fill = "lightblue") +
facet_wrap(~month) +
labs(x=NULL,
y=NULL,
title=paste0(metric,
": hourly boxplot (",
# `value` here is the unit string from the units table
tmpOMHourly$tblUnits %>% filter(name==metric) %>% pull(value),
") from 2010-2022"
)
)
print(p1)
}
# Average wind speed and gust by hour
dfHourlyWind %>%
filter(!(name %in% c("winddirection_10m")), year<=2022) %>%
group_by(month, hour, name) %>%
summarize(across("value", mean), .groups="drop") %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_point(aes(group=name, color=name)) +
facet_wrap(~month) +
labs(title="Average wind speed and gust (km/h)", x=NULL, y="km/h") +
lims(y=c(0, NA))
# Average wind direction by hour
# Bucket degrees into 90-degree compass quadrants; case_when evaluates in
# order, so each later branch implicitly excludes the earlier bands.
# "error" is a catch-all for unexpected values (e.g. NA would not match it,
# but out-of-band numbers would)
dfHourlyWind %>%
filter((name %in% c("winddirection_10m")), year<=2022) %>%
mutate(preDom=case_when(value<45|value>=315~"N",
value<135~"E",
value<225~"S",
value<315~"W",
TRUE~"error"
)
) %>%
count(month, hour, preDom) %>%
ggplot(aes(x=factor(hour), y=n)) +
geom_col(aes(fill=factor(preDom, levels=c("N", "W", "S", "E"))), position="fill") +
facet_wrap(~month) +
labs(title="Distribution of wind direction", x=NULL, y="Wind direction (%)") +
scale_fill_discrete("")
Wind by N/S and E/W is also explored:
# Average wind direction by hour
# Classify each hourly direction into overlapping East/West and North/South
# 120-degree bands (with 60-degree "none" gaps between them).
# NOTE(review): boundary inclusivity is asymmetric — the E band is open at
# both ends (>30 & <150) while W/N/S close one end (<=330, <=60, <=240);
# confirm whether exact boundary degrees should be included consistently
tmpWindDir <- dfHourlyWind %>%
filter((name %in% c("winddirection_10m")), year<=2022) %>%
mutate(ew=case_when(value>30&value<150~"E",
value>210&value<=330~"W",
TRUE~"none"
),
ns=case_when(value>300|value<=60~"N",
value>120&value<=240~"S",
TRUE~"none"
)
)
tmpWindDir
## # A tibble: 113,952 × 8
## time hour year month name value ew ns
## <dttm> <int> <dbl> <fct> <chr> <dbl> <chr> <chr>
## 1 2010-01-01 00:00:00 0 2010 Jan winddirection_10m 298 W none
## 2 2010-01-01 01:00:00 1 2010 Jan winddirection_10m 291 W none
## 3 2010-01-01 02:00:00 2 2010 Jan winddirection_10m 290 W none
## 4 2010-01-01 03:00:00 3 2010 Jan winddirection_10m 289 W none
## 5 2010-01-01 04:00:00 4 2010 Jan winddirection_10m 289 W none
## 6 2010-01-01 05:00:00 5 2010 Jan winddirection_10m 288 W none
## 7 2010-01-01 06:00:00 6 2010 Jan winddirection_10m 287 W none
## 8 2010-01-01 07:00:00 7 2010 Jan winddirection_10m 286 W none
## 9 2010-01-01 08:00:00 8 2010 Jan winddirection_10m 285 W none
## 10 2010-01-01 09:00:00 9 2010 Jan winddirection_10m 282 W none
## # … with 113,942 more rows
# Share of observations in the E / none / W bands by month and hour
tmpWindDir %>%
count(month, hour, ew) %>%
group_by(month, hour) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=hour, y=pct)) +
geom_line(aes(color=factor(ew, levels=c("W", "none", "E")))) +
facet_wrap(~month) +
labs(title="Wind direction",
x="Hour of day",
y="% of observations",
subtitle="(030-150 deg defined as East, 210-330 deg defined as West)"
) +
scale_color_discrete("") +
lims(y=c(0, NA))
# Same plot for the N / none / S bands
tmpWindDir %>%
count(month, hour, ns) %>%
group_by(month, hour) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=hour, y=pct)) +
geom_line(aes(color=factor(ns, levels=c("S", "none", "N")))) +
facet_wrap(~month) +
labs(title="Wind direction",
x="Hour of day",
y="% of observations",
subtitle="(300-060 deg defined as North, 120-240 deg defined as South)"
) +
scale_color_discrete("") +
lims(y=c(0, NA))
Hourly wind directions are averaged using arctan. The formula is arctan2(y=sum-of-sin, x=sum-of-cos):
# Unweighted by wind speed
# Circular mean of daily wind direction: sum unit vectors (cos, sin of each
# hourly direction) per day, then take the angle of the resultant
tmpWindatan_uw <- tmpWindDir %>%
mutate(date=lubridate::date(time), cosine=cos(2*pi*value/360), sine=sin(2*pi*value/360)) %>%
group_by(date) %>%
summarize(across(c(cosine, sine), sum), .groups="drop") %>%
# arctangent (plain atan of the ratio) is kept only to illustrate the
# quadrant ambiguity; arctangent2 resolves the quadrant and is what avgdir uses
mutate(arctangent=atan(sine/cosine),
arctangent2=atan2(y=sine, x=cosine),
# (x/2)*(360/pi) == x*180/pi: radians -> degrees; then wrap negatives to [0, 360)
avgdir=((arctangent2/2)*(360/pi)),
avgdir=ifelse(avgdir<0, 360+avgdir, avgdir)
) %>%
# attach the API-reported daily dominant direction for comparison
left_join(select(tmpOMDaily$tblDaily, date, wdd=winddirection_10m_dominant), by="date")
tmpWindatan_uw
## # A tibble: 4,748 × 7
## date cosine sine arctangent arctangent2 avgdir wdd
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2010-01-01 8.00 -22.2 -1.22 -1.22 290. 291
## 2 2010-01-02 14.8 -18.2 -0.888 -0.888 309. 309
## 3 2010-01-03 16.2 -17.3 -0.817 -0.817 313. 313
## 4 2010-01-04 13.4 -19.8 -0.976 -0.976 304. 304
## 5 2010-01-05 10.9 -21.3 -1.10 -1.10 297. 298
## 6 2010-01-06 1.50 -22.4 -1.50 -1.50 274. 280
## 7 2010-01-07 -9.62 -6.35 0.583 -2.56 213. 240
## 8 2010-01-08 19.6 -11.1 -0.516 -0.516 330. 334
## 9 2010-01-09 13.5 -19.3 -0.962 -0.962 305. 305
## 10 2010-01-10 -13.7 -17.3 0.900 -2.24 232. 226
## # … with 4,738 more rows
tmpWindatan_uw %>%
mutate(delta=avgdir-wdd) %>%
summary()
## date cosine sine arctangent
## Min. :2010-01-01 Min. :-23.894 Min. :-23.930 Min. :-1.5699
## 1st Qu.:2013-04-01 1st Qu.:-14.778 1st Qu.:-14.142 1st Qu.:-0.5057
## Median :2016-07-01 Median : -2.812 Median : -3.161 Median : 0.2924
## Mean :2016-07-01 Mean : -1.955 Mean : -2.518 Mean : 0.1595
## 3rd Qu.:2019-10-01 3rd Qu.: 10.799 3rd Qu.: 8.453 3rd Qu.: 0.8423
## Max. :2022-12-31 Max. : 23.900 Max. : 23.627 Max. : 1.5704
## arctangent2 avgdir wdd delta
## Min. :-3.1408 Min. : 0.1209 Min. : 0.0 Min. :-356.6866
## 1st Qu.:-2.1208 1st Qu.: 92.7324 1st Qu.: 93.0 1st Qu.: -2.5483
## Median :-0.6638 Median :196.5612 Median :198.5 Median : -0.0048
## Mean :-0.4307 Mean :180.2702 Mean :181.2 Mean : -0.9128
## 3rd Qu.: 0.9929 3rd Qu.:255.9577 3rd Qu.:257.0 3rd Qu.: 2.5971
## Max. : 3.1409 Max. :359.9807 Max. :360.0 Max. : 358.5385
# Scatter of reported vs calculated dominant direction; point size = # of days
tmpWindatan_uw %>%
count(wdd, awdd=round(avgdir)) %>%
ggplot(aes(x=wdd, y=awdd)) +
geom_point(aes(size=n)) +
labs(x="Reported dominant wind direction (daily data)",
y="Calculated dominant wind direction (hourly data)",
title="Relationship between reported and calculated dominant wind direction",
subtitle="Unweighted by wind speed"
) +
scale_size_continuous("# days")
# Weighted by wind speed
# Same circular-mean construction, but each hour's unit vector is weighted
# by that hour's wind speed (speed-weighted mean of cos/sin components)
tmpWindatan_wtd <- tmpOMHourly$tblHourly %>%
select(date, time, wd=winddirection_10m, ws=windspeed_10m) %>%
mutate(cosine=cos(2*pi*wd/360), sine=sin(2*pi*wd/360)) %>%
group_by(date) %>%
summarize(across(c(cosine, sine), .fns=function(x) sum(x*ws)/sum(ws)), sws=sum(ws), .groups="drop") %>%
mutate(arctangent=atan(sine/cosine),
# atan2 resolves the quadrant; plain atan kept for comparison
arctangent2=atan2(y=sine, x=cosine),
avgdir=((arctangent2/2)*(360/pi)),
avgdir=ifelse(avgdir<0, 360+avgdir, avgdir),
# average speed over the 24 hourly observations
avgspd=sws/24,
# resultant length scaled by total speed: net "distance" along the mean angle
dist=sws*sqrt(cosine**2+sine**2)
) %>%
left_join(select(tmpOMDaily$tblDaily, date, wdd=winddirection_10m_dominant), by="date")
tmpWindatan_wtd
## # A tibble: 4,914 × 10
## date cosine sine sws arctangent arctang…¹ avgdir avgspd dist wdd
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2010-01-01 0.345 -0.917 463 -1.21 -1.21 291. 19.3 454. 291
## 2 2010-01-02 0.620 -0.757 478. -0.885 -0.885 309. 19.9 468. 309
## 3 2010-01-03 0.675 -0.722 447. -0.819 -0.819 313. 18.6 441. 313
## 4 2010-01-04 0.561 -0.823 461. -0.973 -0.973 304. 19.2 459. 304
## 5 2010-01-05 0.461 -0.884 392. -1.09 -1.09 298. 16.3 391. 298
## 6 2010-01-06 0.169 -0.945 251. -1.39 -1.39 280. 10.5 241. 280
## 7 2010-01-07 -0.250 -0.426 210 1.04 -2.10 240. 8.75 104. 240
## 8 2010-01-08 0.857 -0.418 471. -0.453 -0.453 334. 19.6 449. 334
## 9 2010-01-09 0.569 -0.798 359. -0.951 -0.951 306. 14.9 351. 305
## 10 2010-01-10 -0.643 -0.672 457. 0.808 -2.33 226. 19.0 425. 226
## # … with 4,904 more rows, and abbreviated variable name ¹arctangent2
tmpWindatan_wtd %>%
mutate(delta=avgdir-wdd) %>%
summary()
## date cosine sine sws
## Min. :2010-01-01 Min. :-0.99564 Min. :-0.9971 Min. : 85.3
## 1st Qu.:2013-05-13 1st Qu.:-0.64915 1st Qu.:-0.6081 1st Qu.:247.4
## Median :2016-09-22 Median :-0.11700 Median :-0.1535 Median :336.6
## Mean :2016-09-22 Mean :-0.07922 Mean :-0.1086 Mean :354.5
## 3rd Qu.:2020-02-02 3rd Qu.: 0.47777 3rd Qu.: 0.3654 3rd Qu.:436.6
## Max. :2023-06-15 Max. : 0.99606 Max. : 0.9855 Max. :933.4
## arctangent arctangent2 avgdir avgspd
## Min. :-1.5707 Min. :-3.1403 Min. : 0.015 Min. : 3.554
## 1st Qu.:-0.4878 1st Qu.:-2.1529 1st Qu.: 91.028 1st Qu.:10.309
## Median : 0.3097 Median :-0.7181 Median :198.011 Median :14.023
## Mean : 0.1642 Mean :-0.4630 Mean :180.505 Mean :14.770
## 3rd Qu.: 0.8461 3rd Qu.: 0.9509 3rd Qu.:257.184 3rd Qu.:18.191
## Max. : 1.5705 Max. : 3.1402 Max. :359.957 Max. :38.892
## dist wdd delta
## Min. : 3.001 Min. : 0.0 Min. :-359.9850
## 1st Qu.:186.211 1st Qu.: 91.0 1st Qu.: -0.2522
## Median :285.688 Median :198.0 Median : 0.0037
## Mean :301.524 Mean :180.7 Mean : -0.1459
## 3rd Qu.:399.462 3rd Qu.:257.0 3rd Qu.: 0.2536
## Max. :920.029 Max. :360.0 Max. : 1.5144
# Scatter of reported vs calculated (speed-weighted) dominant direction
tmpWindatan_wtd %>%
count(wdd, awdd=round(avgdir)) %>%
ggplot(aes(x=wdd, y=awdd)) +
geom_point(aes(size=n)) +
labs(x="Reported dominant wind direction (daily data)",
y="Calculated dominant wind direction (hourly data)",
title="Relationship between reported and calculated dominant wind direction",
subtitle="Weighted by wind speed"
) +
scale_size_continuous("# days")
# Compare gross average speed with the direction-weighted (resultant) speed;
# the dashed identity line marks days where wind blew in one steady direction
tmpWindatan_wtd %>%
count(rdist=round(dist), rspd=round(avgspd)) %>%
ggplot(aes(x=rspd, y=rdist/24)) +
geom_point(aes(size=n)) +
labs(title="Average wind speed (total and weighted by direction) by day",
x="Average wind speed per day",
y="Weighted average wind speed\n(total distance on average angle, divided by 24)"
) +
geom_abline(slope=1, intercept=0, lty=2) +
scale_size_continuous("# days")
# Same comparison as a ratio (1 = perfectly steady direction all day)
tmpWindatan_wtd %>%
mutate(rdist=round(dist), rspd=round(avgspd)) %>%
ggplot(aes(x=rspd)) +
geom_boxplot(aes(y=dist/24/rspd, group=rspd), fill="lightblue") +
labs(title="Average wind speed (total and weighted by direction) by day",
x="Average wind speed per day",
y="Average weighted wind speed\n(as ratio of gross average)"
)
# List the few days where calculated and reported direction differ by > 1 degree
tmpWindatan_wtd %>%
filter(abs(avgdir-wdd)>1)
## # A tibble: 7 × 10
## date cosine sine sws arctangent arctang…¹ avgdir avgspd dist
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011-12-02 0.174 0.0000455 288. 0.000262 0.000262 1.50e-2 12.0 50.0
## 2 2013-07-31 0.0206 0.0414 182. 1.11 1.11 6.35e+1 7.58 8.41
## 3 2016-04-22 0.918 0.00197 406. 0.00215 0.00215 1.23e-1 16.9 373.
## 4 2017-07-31 -0.0306 -0.0569 86.6 1.08 -2.06 2.42e+2 3.61 5.59
## 5 2019-07-17 0.0352 0.0545 139. 0.998 0.998 5.72e+1 5.80 9.03
## 6 2020-11-22 -0.108 0.0523 228. -0.452 2.69 1.54e+2 9.48 27.3
## 7 2023-03-01 0.0482 0.0180 251. 0.358 0.358 2.05e+1 10.5 12.9
## # … with 1 more variable: wdd <int>, and abbreviated variable name ¹arctangent2
The wind direction averaging of hourly data, weighted by wind speed, is consistent with the reported dominant wind direction in the daily data.
Weather codes are explored:
# Hourly weather codes, evapotranspiration, and shortwave
# Long-format table of three metrics renamed to short codes (wc/et/sw),
# with year and ordered month factor, same shape as the other hourly tables
dfHourlyCode <- tmpOMHourly$tblHourly %>%
select(time, hour, wc=weathercode, et=et0_fao_evapotranspiration, sw=shortwave_radiation) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyCode
## # A tibble: 353,808 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 0 2010 Jan wc 2
## 2 2010-01-01 00:00:00 0 2010 Jan et 0.02
## 3 2010-01-01 00:00:00 0 2010 Jan sw 0
## 4 2010-01-01 01:00:00 1 2010 Jan wc 1
## 5 2010-01-01 01:00:00 1 2010 Jan et 0.01
## 6 2010-01-01 01:00:00 1 2010 Jan sw 0
## 7 2010-01-01 02:00:00 2 2010 Jan wc 0
## 8 2010-01-01 02:00:00 2 2010 Jan et 0.01
## 9 2010-01-01 02:00:00 2 2010 Jan sw 0
## 10 2010-01-01 03:00:00 3 2010 Jan wc 0
## # … with 353,798 more rows
# Exploration of weather codes overall
# Horizontal bar chart of hourly weather-code counts (in thousands)
dfHourlyCode %>%
filter(name=="wc", year<=2022) %>%
count(value) %>%
ggplot(aes(x=fct_rev(factor(value)), y=n/1000)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(n/1000, 1)), hjust=0, size=3) +
labs(title="Weather codes in hourly data (2010-2022)", y="Count (000)", x="Weather Code") +
coord_flip()
# Exploration of weather codes by month
# Same chart as relative frequency within each month, faceted by month
dfHourlyCode %>%
filter(name=="wc", year<=2022) %>%
count(month, value) %>%
group_by(month) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=fct_rev(factor(value)), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=paste0(round(100*pct, 1), "%")), hjust=0, size=3) +
labs(title="Weather codes in hourly data (2010-2022)", y="Frequency", x="Weather Code") +
coord_flip() +
facet_wrap(~month)
# Precipitation type (derived from weather code) by month
# Codes map to WMO Table 4677 bands (see comment table below); the
# "error" branch catches any code outside the expected bands
dfHourlyCode %>%
filter(name=="wc", year<=2022) %>%
mutate(wType=case_when(value<=3~"Dry",
value>=51 & value<=55~"Drizzle",
value>=61 & value<=65~"Rain",
value>=71 & value<=75~"Snow",
TRUE~"error"
)
) %>%
count(month, wType) %>%
group_by(month) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=month, y=pct)) +
geom_col(aes(fill=factor(wType, levels=c("Dry", "Drizzle", "Rain", "Snow"))), position="stack") +
labs(title="Precipitation types in hourly data (2010-2022)", y="Frequency", x=NULL) +
coord_flip() +
scale_fill_discrete("")
# Weather codes from WMO Code Table 4677 (select examples)
# 00 Cloud development not observed or not observable
# 01 Clouds generally dissolving or becoming less developed
# 02 State of sky on the whole unchanged
# 03 Clouds generally forming or developing
# 51 Drizzle, not freezing, continuous (slight)
# 53 Drizzle, not freezing, continuous (moderate)
# 55 Drizzle, not freezing, continuous (heavy)
# 61 Rain, not freezing, continuous (slight)
# 63 Rain, not freezing, continuous (moderate)
# 65 Rain, not freezing, continuous (heavy)
# 71 Continuous fall of snowflakes (slight)
# 73 Continuous fall of snowflakes (moderate)
# 75 Continuous fall of snowflakes (heavy)
Shortwave radiation is explored:
# Distribution of hourly shortwave radiation by hour of day, faceted by month
# (fixed typo in y-axis labels: "sqaure" -> "square")
dfHourlyCode %>%
  filter(name=="sw", year<=2022) %>%
  ggplot(aes(x=factor(hour), y=value)) +
  geom_boxplot(fill="lightblue") +
  facet_wrap(~month) +
  labs(title="Average shortwave solar radiation over the past hour", x=NULL, y="Watts per square meter")
# Mean hourly shortwave radiation by hour and month
dfHourlyCode %>%
  filter(name=="sw", year<=2022) %>%
  group_by(hour, month) %>%
  summarize(value=mean(value), .groups="drop") %>%
  ggplot(aes(x=factor(hour), y=value)) +
  geom_point() +
  facet_wrap(~month) +
  labs(title="Average hourly shortwave solar radiation by hour and month (2010-2022)",
       x=NULL,
       y="Watts per square meter"
  )
# Share of days on which each hour holds the daily max/min radiation
# (all tied hours counted in full)
dfHourlyCode %>%
  filter(name %in% c("sw"), year<=2022) %>%
  mutate(date=lubridate::date(time)) %>%
  group_by(date, name) %>%
  mutate(isMax=ifelse(value==max(value), 1, 0), isMin=ifelse(value==min(value), 1, 0)) %>%
  group_by(name, month, hour) %>%
  summarize(across(c(isMax, isMin), mean), .groups="drop") %>%
  pivot_longer(-c(name, month, hour), names_to="metric") %>%
  ggplot(aes(x=hour, y=value)) +
  geom_line(aes(color=metric, group=metric)) +
  facet_wrap(~month) +
  labs(x="Hour of day",
       y="% of time as max/min",
       title=paste0("Shortwave radiation", ": maximum and minimum by hour"),
       subtitle=paste0("Ties included as full value")
  ) +
  scale_color_discrete("Metric:")
Evapotranspiration is explored:
# Distribution of hourly evapotranspiration by hour of day, faceted by month
dfHourlyCode %>%
filter(name=="et", year<=2022) %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~month) +
labs(title="Evapotranspiration of a well-watered grass field", x="Hour of day", y="mm")
# Mean hourly evapotranspiration by hour and month
dfHourlyCode %>%
filter(name=="et", year<=2022) %>%
group_by(hour, month) %>%
summarize(value=mean(value), .groups="drop") %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_point() +
facet_wrap(~month) +
labs(title="Mean evapotranspiration of a well-watered grass field by hour and month (2010-2022)",
x="Hour of day",
y="mm"
)
# Share of days on which each hour holds the daily max/min evapotranspiration
# (all tied hours counted in full)
dfHourlyCode %>%
filter(name %in% c("et"), year<=2022) %>%
mutate(date=lubridate::date(time)) %>%
group_by(date, name) %>%
mutate(isMax=ifelse(value==max(value), 1, 0), isMin=ifelse(value==min(value), 1, 0)) %>%
group_by(name, month, hour) %>%
summarize(across(c(isMax, isMin), mean), .groups="drop") %>%
pivot_longer(-c(name, month, hour), names_to="metric") %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(color=metric, group=metric)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="proportion of time as max/min",
title=paste0("Evapotranspiration", ": maximum and minimum by hour"),
subtitle=paste0("Ties included as full value")
) +
scale_color_discrete("Metric:")
Cloud cover is explored:
# Create cloud cover data
# Long-format table of all columns containing "cloud" (total/low/mid/high),
# with year and ordered month factor, same shape as the other hourly tables
dfHourlyCloud <- tmpOMHourly$tblHourly %>%
select(time, hour, contains("cloud")) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyCloud
## # A tibble: 471,744 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <int>
## 1 2010-01-01 00:00:00 0 2010 Jan cloudcover 62
## 2 2010-01-01 00:00:00 0 2010 Jan cloudcover_low 69
## 3 2010-01-01 00:00:00 0 2010 Jan cloudcover_mid 0
## 4 2010-01-01 00:00:00 0 2010 Jan cloudcover_high 0
## 5 2010-01-01 01:00:00 1 2010 Jan cloudcover 47
## 6 2010-01-01 01:00:00 1 2010 Jan cloudcover_low 52
## 7 2010-01-01 01:00:00 1 2010 Jan cloudcover_mid 0
## 8 2010-01-01 01:00:00 1 2010 Jan cloudcover_high 0
## 9 2010-01-01 02:00:00 2 2010 Jan cloudcover 20
## 10 2010-01-01 02:00:00 2 2010 Jan cloudcover_low 22
## # … with 471,734 more rows
# One hour-of-day boxplot per cloud-cover variable (data through 2022)
for(keyMetric in unique(dfHourlyCloud$name)) {
  dfPlot <- dfHourlyCloud %>%
    filter(name == keyMetric) %>%
    filter(year <= 2022)
  p1 <- ggplot(dfPlot, aes(x = factor(hour), y = value)) +
    geom_boxplot(fill = "lightblue") +
    facet_wrap(~month) +
    labs(title = paste0(keyMetric, ": % sky covered with cloud"),
         x = "Hour of day",
         y = "% sky covered with cloud"
    )
  print(p1)
}
# Flag each day's max/min hour per cloud metric, then average to month/hour rates
tmpDFCloud <- dfHourlyCloud %>%
  filter(year <= 2022) %>%
  mutate(date = lubridate::date(time)) %>%
  group_by(date, name) %>%
  mutate(isMax = as.numeric(value == max(value)),
         isMin = as.numeric(value == min(value))
  ) %>%
  group_by(name, month, hour) %>%
  summarize(across(c(isMax, isMin), mean), .groups = "drop") %>%
  pivot_longer(-c(name, month, hour), names_to = "metric")
tmpDFCloud
## # A tibble: 2,304 × 5
## name month hour metric value
## <chr> <fct> <int> <chr> <dbl>
## 1 cloudcover Jan 0 isMax 0.300
## 2 cloudcover Jan 0 isMin 0.208
## 3 cloudcover Jan 1 isMax 0.273
## 4 cloudcover Jan 1 isMin 0.161
## 5 cloudcover Jan 2 isMax 0.266
## 6 cloudcover Jan 2 isMin 0.132
## 7 cloudcover Jan 3 isMax 0.283
## 8 cloudcover Jan 3 isMin 0.141
## 9 cloudcover Jan 4 isMax 0.298
## 10 cloudcover Jan 4 isMin 0.141
## # … with 2,294 more rows
# Line plot of max/min hour frequencies for each cloud metric
for(keyMetric in unique(tmpDFCloud$name)) {
  p1 <- tmpDFCloud %>%
    filter(name == keyMetric) %>%
    ggplot(aes(x = hour, y = value)) +
    geom_line(aes(group = metric, color = metric)) +
    scale_color_discrete("Metric:") +
    facet_wrap(~month) +
    labs(title = paste0(keyMetric, ": maximum and minimum by hour"),
         subtitle = paste0("Ties included as full value"),
         x = "Hour of day",
         y = "% of time as max/min"
    )
  print(p1)
}
Atmospheric pressure is explored:
# Reshape the hourly pressure columns to long form, tagged with year/month
dfHourlyPressure <- tmpOMHourly$tblHourly %>%
  select(time, hour, contains("pressure")) %>%
  mutate(year = lubridate::year(time)) %>%
  mutate(month = factor(month.abb[lubridate::month(time)], levels = month.abb)) %>%
  pivot_longer(cols = -c(time, year, month, hour))
dfHourlyPressure
## # A tibble: 353,808 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 0 2010 Jan pressure_msl 1024.
## 2 2010-01-01 00:00:00 0 2010 Jan surface_pressure 1001.
## 3 2010-01-01 00:00:00 0 2010 Jan vapor_pressure_deficit 0.1
## 4 2010-01-01 01:00:00 1 2010 Jan pressure_msl 1025.
## 5 2010-01-01 01:00:00 1 2010 Jan surface_pressure 1001.
## 6 2010-01-01 01:00:00 1 2010 Jan vapor_pressure_deficit 0.09
## 7 2010-01-01 02:00:00 2 2010 Jan pressure_msl 1025.
## 8 2010-01-01 02:00:00 2 2010 Jan surface_pressure 1002.
## 9 2010-01-01 02:00:00 2 2010 Jan vapor_pressure_deficit 0.08
## 10 2010-01-01 03:00:00 3 2010 Jan pressure_msl 1026.
## # … with 353,798 more rows
# One hour-of-day boxplot per pressure variable, labeled with its unit
for(keyMetric in unique(dfHourlyPressure$name)) {
  tmpUnits <- tmpOMHourly$tblUnits %>%
    filter(name == keyMetric) %>%
    pull(value)
  p1 <- dfHourlyPressure %>%
    filter(name == keyMetric) %>%
    filter(year <= 2022) %>%
    ggplot(aes(x = factor(hour), y = value)) +
    geom_boxplot(fill = "lightblue") +
    facet_wrap(~month) +
    labs(title = paste0(keyMetric, ": ", tmpUnits),
         x = "Hour of day",
         y = paste0(keyMetric, " (", tmpUnits, ")")
    )
  print(p1)
}
# Relationship between sea-level and surface pressure (point size = frequency)
dfPressureWide <- dfHourlyPressure %>%
  pivot_wider(id_cols = c(time, hour, year, month)) %>%
  count(pressure_msl, surface_pressure)
ggplot(dfPressureWide, aes(x = pressure_msl, y = surface_pressure)) +
  geom_point(aes(size = n)) +
  geom_smooth(aes(weight = n), method = "lm") +
  labs(x = "MSL", y = "Surface Pressure", title = "Surface pressure vs. MSL")
## `geom_smooth()` using formula = 'y ~ x'
# Flag each day's max/min hour per pressure metric, then average to month/hour rates
tmpDFPressure <- dfHourlyPressure %>%
  filter(year <= 2022) %>%
  mutate(date = lubridate::date(time)) %>%
  group_by(date, name) %>%
  mutate(isMax = as.numeric(value == max(value)),
         isMin = as.numeric(value == min(value))
  ) %>%
  group_by(name, month, hour) %>%
  summarize(across(c(isMax, isMin), mean), .groups = "drop") %>%
  pivot_longer(-c(name, month, hour), names_to = "metric")
tmpDFPressure
## # A tibble: 1,728 × 5
## name month hour metric value
## <chr> <fct> <int> <chr> <dbl>
## 1 pressure_msl Jan 0 isMax 0.278
## 2 pressure_msl Jan 0 isMin 0.283
## 3 pressure_msl Jan 1 isMax 0.0298
## 4 pressure_msl Jan 1 isMin 0.0546
## 5 pressure_msl Jan 2 isMax 0.0149
## 6 pressure_msl Jan 2 isMin 0.0223
## 7 pressure_msl Jan 3 isMax 0.0248
## 8 pressure_msl Jan 3 isMin 0.0248
## 9 pressure_msl Jan 4 isMax 0.0174
## 10 pressure_msl Jan 4 isMin 0.0149
## # … with 1,718 more rows
# Line plot of max/min hour frequencies for each pressure metric
for(keyMetric in unique(tmpDFPressure$name)) {
  p1 <- tmpDFPressure %>%
    filter(name == keyMetric) %>%
    ggplot(aes(x = hour, y = value)) +
    geom_line(aes(group = metric, color = metric)) +
    scale_color_discrete("Metric:") +
    facet_wrap(~month) +
    labs(title = paste0(keyMetric, ": maximum and minimum by hour"),
         subtitle = paste0("Ties included as full value"),
         x = "Hour of day",
         y = "% of time as max/min"
    )
  print(p1)
}
Soil temperature is explored:
# Reshape the hourly soil-temperature columns to long form, tagged with year/month
dfHourlySoilTemp <- tmpOMHourly$tblHourly %>%
  select(time, date, hour, starts_with("soil_temp")) %>%
  mutate(year = lubridate::year(time)) %>%
  mutate(month = factor(month.abb[lubridate::month(time)], levels = month.abb)) %>%
  pivot_longer(cols = -c(time, date, year, month, hour))
dfHourlySoilTemp
## # A tibble: 471,744 × 7
## time date hour year month name value
## <dttm> <date> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_temperature_0_to… -1.5
## 2 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_temperature_7_to… -0.4
## 3 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_temperature_28_t… 2.4
## 4 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_temperature_100_… 9
## 5 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_temperature_0_to… -1.6
## 6 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_temperature_7_to… -0.4
## 7 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_temperature_28_t… 2.4
## 8 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_temperature_100_… 9
## 9 2010-01-01 02:00:00 2010-01-01 2 2010 Jan soil_temperature_0_to… -1.8
## 10 2010-01-01 02:00:00 2010-01-01 2 2010 Jan soil_temperature_7_to… -0.4
## # … with 471,734 more rows
# One hour-of-day boxplot per soil-temperature depth, labeled with its unit
for(keyMetric in unique(dfHourlySoilTemp$name)) {
  tmpUnits <- tmpOMHourly$tblUnits %>%
    filter(name == keyMetric) %>%
    pull(value)
  p1 <- dfHourlySoilTemp %>%
    filter(name == keyMetric) %>%
    filter(year <= 2022) %>%
    ggplot(aes(x = factor(hour), y = value)) +
    geom_boxplot(fill = "lightblue") +
    facet_wrap(~month) +
    labs(title = paste0(keyMetric, ": ", tmpUnits),
         x = "Hour of day",
         y = paste0(keyMetric, " (", tmpUnits, ")")
    )
  print(p1)
}
# Day-of-year profile of soil temperature: daily mean and mean daily standard
# deviation across all years, one line per soil depth band
dfHourlySoilTemp %>%
group_by(date, name) %>%
# Per-day mean (mu) and SD (sigma) of the hourly values for each depth
summarize(across(value, .fns=list(mu=mean, sigma=sd)), .groups="drop") %>%
mutate(doy=lubridate::yday(date)) %>%
group_by(doy, name) %>%
# Average each daily statistic across years for every day of year
summarize(across(starts_with("value"), .fns=list(mu=mean)), .groups="drop") %>%
pivot_longer(cols=-c(doy, name), names_to="metric") %>%
ggplot(aes(x=doy, y=value)) +
geom_line(aes(group=name, color=stringr::str_replace(name, "soil_temperature_", ""))) +
# Named-vector lookup turns the generated metric codes into readable facet titles
facet_wrap(~c("value_mu_mu"="Daily mean", "value_sigma_mu"="Mean daily standard deviation")[metric],
nrow=2,
scales="free_y"
) +
labs(x="Day of Year",
y="Degrees (C)",
title="Soil temperature mean and average daily standard deviation"
) +
scale_color_discrete("Soil depth")
# Flag each day's max/min hour per soil-temperature depth, then average to month/hour rates
tmpDFSoilTemp <- dfHourlySoilTemp %>%
  filter(year <= 2022) %>%
  mutate(date = lubridate::date(time)) %>%
  group_by(date, name) %>%
  mutate(isMax = as.numeric(value == max(value)),
         isMin = as.numeric(value == min(value))
  ) %>%
  group_by(name, month, hour) %>%
  summarize(across(c(isMax, isMin), mean), .groups = "drop") %>%
  pivot_longer(-c(name, month, hour), names_to = "metric")
tmpDFSoilTemp
## # A tibble: 2,304 × 5
## name month hour metric value
## <chr> <fct> <int> <chr> <dbl>
## 1 soil_temperature_0_to_7cm Jan 0 isMax 0.238
## 2 soil_temperature_0_to_7cm Jan 0 isMin 0.208
## 3 soil_temperature_0_to_7cm Jan 1 isMax 0.102
## 4 soil_temperature_0_to_7cm Jan 1 isMin 0.161
## 5 soil_temperature_0_to_7cm Jan 2 isMax 0.0645
## 6 soil_temperature_0_to_7cm Jan 2 isMin 0.154
## 7 soil_temperature_0_to_7cm Jan 3 isMax 0.0670
## 8 soil_temperature_0_to_7cm Jan 3 isMin 0.154
## 9 soil_temperature_0_to_7cm Jan 4 isMax 0.0620
## 10 soil_temperature_0_to_7cm Jan 4 isMin 0.166
## # … with 2,294 more rows
# Line plot of max/min hour frequencies for each soil-temperature depth
for(keyMetric in unique(tmpDFSoilTemp$name)) {
  p1 <- tmpDFSoilTemp %>%
    filter(name == keyMetric) %>%
    ggplot(aes(x = hour, y = value)) +
    geom_line(aes(group = metric, color = metric)) +
    scale_color_discrete("Metric:") +
    facet_wrap(~month) +
    labs(title = paste0(keyMetric, ": maximum and minimum by hour"),
         subtitle = paste0("Ties included as full value"),
         x = "Hour of day",
         y = "% of time as max/min"
    )
  print(p1)
}
Soil moisture is explored:
# Reshape the hourly soil-moisture columns to long form, tagged with year/month
dfHourlySoilMoist <- tmpOMHourly$tblHourly %>%
  select(time, date, hour, starts_with("soil_moist")) %>%
  mutate(year = lubridate::year(time)) %>%
  mutate(month = factor(month.abb[lubridate::month(time)], levels = month.abb)) %>%
  pivot_longer(cols = -c(time, date, year, month, hour))
dfHourlySoilMoist
## # A tibble: 471,744 × 7
## time date hour year month name value
## <dttm> <date> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_moisture_0_to_7cm 0.295
## 2 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_moisture_7_to_28… 0.3
## 3 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_moisture_28_to_1… 0.334
## 4 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_moisture_100_to_… 0.31
## 5 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_moisture_0_to_7cm 0.295
## 6 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_moisture_7_to_28… 0.3
## 7 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_moisture_28_to_1… 0.334
## 8 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_moisture_100_to_… 0.31
## 9 2010-01-01 02:00:00 2010-01-01 2 2010 Jan soil_moisture_0_to_7cm 0.294
## 10 2010-01-01 02:00:00 2010-01-01 2 2010 Jan soil_moisture_7_to_28… 0.3
## # … with 471,734 more rows
# One hour-of-day boxplot per soil-moisture depth, labeled with its unit
for(keyMetric in unique(dfHourlySoilMoist$name)) {
  tmpUnits <- tmpOMHourly$tblUnits %>%
    filter(name == keyMetric) %>%
    pull(value)
  p1 <- dfHourlySoilMoist %>%
    filter(name == keyMetric) %>%
    filter(year <= 2022) %>%
    ggplot(aes(x = factor(hour), y = value)) +
    geom_boxplot(fill = "lightblue") +
    facet_wrap(~month) +
    labs(title = paste0(keyMetric, ": ", tmpUnits),
         x = "Hour of day",
         y = paste0(keyMetric, " (", tmpUnits, ")")
    )
  print(p1)
}
# Day-of-year profile of soil moisture: daily mean and mean daily standard
# deviation across all years, one line per soil depth band
dfHourlySoilMoist %>%
group_by(date, name) %>%
# Per-day mean (mu) and SD (sigma) of the hourly values for each depth
summarize(across(value, .fns=list(mu=mean, sigma=sd)), .groups="drop") %>%
mutate(doy=lubridate::yday(date)) %>%
group_by(doy, name) %>%
# Average each daily statistic across years for every day of year
summarize(across(starts_with("value"), .fns=list(mu=mean)), .groups="drop") %>%
pivot_longer(cols=-c(doy, name), names_to="metric") %>%
ggplot(aes(x=doy, y=value)) +
geom_line(aes(group=name, color=stringr::str_replace(name, "soil_moisture_", ""))) +
# Named-vector lookup turns the generated metric codes into readable facet titles
facet_wrap(~c("value_mu_mu"="Daily mean", "value_sigma_mu"="Mean daily standard deviation")[metric],
nrow=2,
scales="free_y"
) +
labs(x="Day of Year",
y="cubic meters per cubic meter\n(volumetric mixing ratio)",
title="Soil moisture mean and average daily standard deviation"
) +
scale_color_discrete("Soil depth")
# Flag each day's max/min hour per soil-moisture depth, then average to month/hour rates
tmpDFSoilMoist <- dfHourlySoilMoist %>%
  filter(year <= 2022) %>%
  mutate(date = lubridate::date(time)) %>%
  group_by(date, name) %>%
  mutate(isMax = as.numeric(value == max(value)),
         isMin = as.numeric(value == min(value))
  ) %>%
  group_by(name, month, hour) %>%
  summarize(across(c(isMax, isMin), mean), .groups = "drop") %>%
  pivot_longer(-c(name, month, hour), names_to = "metric")
tmpDFSoilMoist
## # A tibble: 2,304 × 5
## name month hour metric value
## <chr> <fct> <int> <chr> <dbl>
## 1 soil_moisture_0_to_7cm Jan 0 isMax 0.737
## 2 soil_moisture_0_to_7cm Jan 0 isMin 0.0844
## 3 soil_moisture_0_to_7cm Jan 1 isMax 0.529
## 4 soil_moisture_0_to_7cm Jan 1 isMin 0.0744
## 5 soil_moisture_0_to_7cm Jan 2 isMax 0.434
## 6 soil_moisture_0_to_7cm Jan 2 isMin 0.0794
## 7 soil_moisture_0_to_7cm Jan 3 isMax 0.375
## 8 soil_moisture_0_to_7cm Jan 3 isMin 0.0893
## 9 soil_moisture_0_to_7cm Jan 4 isMax 0.310
## 10 soil_moisture_0_to_7cm Jan 4 isMin 0.0918
## # … with 2,294 more rows
# Line plot of max/min hour frequencies for each soil-moisture depth
for(keyMetric in unique(tmpDFSoilMoist$name)) {
  p1 <- tmpDFSoilMoist %>%
    filter(name == keyMetric) %>%
    ggplot(aes(x = hour, y = value)) +
    geom_line(aes(group = metric, color = metric)) +
    scale_color_discrete("Metric:") +
    facet_wrap(~month) +
    labs(title = paste0(keyMetric, ": maximum and minimum by hour"),
         subtitle = paste0("Ties included as full value"),
         x = "Hour of day",
         y = "% of time as max/min"
    )
  print(p1)
}
Metrics are explored for their variation over months and over hours of day:
# Sample table: temperature rounded to the nearest 0.5 C plus month/hour factors
tmpTemp <- tmpOMHourly$tblHourly %>%
  select(time, date, temperature_2m) %>%
  mutate(month = factor(month.abb[lubridate::month(date)], levels = month.abb)) %>%
  mutate(hour = lubridate::hour(time)) %>%
  mutate(fct_hour = factor(hour)) %>%
  mutate(rndTemp = round(2 * temperature_2m, 0) / 2)
tmpTemp
## # A tibble: 117,936 × 7
## time date temperature_2m month hour fct_hour rndTemp
## <dttm> <date> <dbl> <fct> <int> <fct> <dbl>
## 1 2010-01-01 00:00:00 2010-01-01 -9.5 Jan 0 0 -9.5
## 2 2010-01-01 01:00:00 2010-01-01 -9.8 Jan 1 1 -10
## 3 2010-01-01 02:00:00 2010-01-01 -10.3 Jan 2 2 -10.5
## 4 2010-01-01 03:00:00 2010-01-01 -10.8 Jan 3 3 -11
## 5 2010-01-01 04:00:00 2010-01-01 -11.3 Jan 4 4 -11.5
## 6 2010-01-01 05:00:00 2010-01-01 -11.8 Jan 5 5 -12
## 7 2010-01-01 06:00:00 2010-01-01 -12.3 Jan 6 6 -12.5
## 8 2010-01-01 07:00:00 2010-01-01 -12.8 Jan 7 7 -13
## 9 2010-01-01 08:00:00 2010-01-01 -13.2 Jan 8 8 -13
## 10 2010-01-01 09:00:00 2010-01-01 -13.4 Jan 9 9 -13.5
## # … with 117,926 more rows
# Modal-month classifier: for each rounded temperature, predict its most common month
prdTemp <- tmpTemp %>%
  count(rndTemp, month) %>%
  arrange(rndTemp, desc(n)) %>%
  group_by(rndTemp) %>%
  mutate(corr = row_number() == 1) %>%
  mutate(pred = first(month)) %>%
  ungroup()
prdTemp
## # A tibble: 828 × 5
## rndTemp month n corr pred
## <dbl> <fct> <int> <lgl> <fct>
## 1 -30 Jan 4 TRUE Jan
## 2 -29.5 Jan 6 TRUE Jan
## 3 -29 Jan 4 TRUE Jan
## 4 -28.5 Jan 6 TRUE Jan
## 5 -28 Jan 2 TRUE Jan
## 6 -27.5 Jan 1 TRUE Jan
## 7 -27 Jan 6 TRUE Jan
## 8 -26.5 Jan 4 TRUE Jan
## 9 -26 Jan 10 TRUE Jan
## 10 -25.5 Jan 15 TRUE Jan
## # … with 818 more rows
# Confusion matrix and accuracy
# Per-month accuracy plus an appended "All" total row; pctNaive is the accuracy
# of a naive guess (uniform 1/12 for "All", the month's share of hours otherwise)
prdTemp %>%
count(month, corr, wt=n) %>%
pivot_wider(id_cols=month, names_from=corr, values_from=n, values_fill=0) %>%
bind_rows(summarize(., across(where(is.numeric), sum)) %>%
mutate(month="All") %>%
select(month, everything())
) %>%
mutate(n=`TRUE`+`FALSE`,
pctCorrect=`TRUE`/n,
# NOTE: after the "All" row is appended, sum(n) is double the true total and
# nrow(.) counts 13 rows, so 2*n/sum(n) and 1/(nrow(.)-1) undo that inflation
pctNaive=ifelse(month=="All", 1/(nrow(.)-1), 2*n/sum(n)),
lift=pctCorrect/pctNaive
)
## # A tibble: 13 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <chr> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 4215 6201 10416 0.595 0.0883 6.74
## 2 Feb 8404 1076 9480 0.114 0.0804 1.41
## 3 Mar 8817 1599 10416 0.154 0.0883 1.74
## 4 Apr 6972 3108 10080 0.308 0.0855 3.61
## 5 May 8584 1832 10416 0.176 0.0883 1.99
## 6 Jun 8535 1185 9720 0.122 0.0824 1.48
## 7 Jul 6524 3148 9672 0.325 0.0820 3.97
## 8 Aug 4335 5337 9672 0.552 0.0820 6.73
## 9 Sep 6888 2472 9360 0.264 0.0794 3.33
## 10 Oct 7248 2424 9672 0.251 0.0820 3.06
## 11 Nov 9360 0 9360 0 0.0794 0
## 12 Dec 7130 2542 9672 0.263 0.0820 3.20
## 13 All 87012 30924 117936 0.262 0.0833 3.15
# Heat-map style confusion plot: actual vs. predicted month with counts
prdTemp %>%
  count(month, pred, corr, wt = n) %>%
  ggplot(aes(x = month, y = pred)) +
  geom_text(aes(label = n)) +
  geom_tile(aes(fill = corr), alpha = 0.25) +
  labs(x = "Actual month", y = "Predicted month", title = "Actual vs. predicted month using temperature")
The simple predictive model is converted to functional form:
# Build and evaluate a one-variable "modal class" predictor: for each value of
# the predictor, the most frequent target value becomes the prediction.
simpleOneVarPredict <- function(df,
                                tgt,
                                prd,
                                nPrint=30,
                                showPlot=TRUE,
                                returnData=TRUE
                                ) {
  # FUNCTION ARGUMENTS:
  # df: data frame or tibble with key elements
  # tgt: target variable (column name as a string)
  # prd: predictor variable (column name as a string)
  # nPrint: maximum number of lines of confusion matrix to print
  #   0 means do not print any summary statistics
  # showPlot: boolean, should overlap plot be created and shown?
  # returnData: boolean, should the result tables be returned?
  # RETURNS (when returnData=TRUE): list(dfPred, dfConf, dfConfAll)

  # Counts of predictor to target variable; after sorting by count,
  # row_number()==1 marks the modal (predicted) target per predictor value
  dfPred <- df %>%
    group_by(across(all_of(c(prd, tgt)))) %>%
    summarize(n=n(), .groups="drop") %>%
    arrange(across(all_of(prd)), desc(n)) %>%
    group_by(across(all_of(prd))) %>%
    mutate(correct=row_number()==1, predicted=first(.data[[tgt]])) %>%
    ungroup()
  # Confusion matrix and accuracy by target level
  dfConf <- dfPred %>%
    group_by(across(all_of(c(tgt, "correct")))) %>%
    summarize(n=sum(n), .groups="drop") %>%
    # all_of() avoids the tidyselect deprecation warning for bare external vectors
    pivot_wider(id_cols=all_of(tgt), names_from=correct, values_from=n, values_fill=0) %>%
    # Guard: a perfect predictor produces no FALSE column at all
    { if("FALSE" %in% names(.)) . else mutate(., `FALSE`=0L) } %>%
    mutate(n=`TRUE`+`FALSE`,
           pctCorrect=`TRUE`/n,
           pctNaive=1/(nrow(.)),
           lift=pctCorrect/pctNaive-1
           )
  # Overall confusion matrix; naive benchmark = always guess the modal target
  dfConfAll <- dfConf %>%
    summarize(nMax=max(n), across(c(`FALSE`, `TRUE`, "n"), sum)) %>%
    mutate(pctCorrect=`TRUE`/n,
           pctNaive=nMax/n,
           lift=pctCorrect/pctNaive-1,
           nBucket=length(unique(dfPred[[prd]]))
           )
  # Print confusion matrices
  if(nPrint > 0) {
    cat("\nAccuracy by target subgroup:\n")
    dfConf %>% print(n=nPrint)
    cat("\nOverall Accuracy:\n")
    dfConfAll %>% print(n=nPrint)
  }
  # Plot of overlaps (actual vs. predicted with counts)
  if(isTRUE(showPlot)) {
    p1 <- dfPred %>%
      group_by(across(c(all_of(tgt), "predicted", "correct"))) %>%
      summarize(n=sum(n), .groups="drop") %>%
      ggplot(aes(x=.data[[tgt]], y=predicted)) +
      labs(x="Actual",
           y="Predicted",
           title=paste0("Actual vs. predicted ", tgt),
           subtitle=paste0("(using ", prd, ")")
           ) +
      geom_text(aes(label=n)) +
      geom_tile(aes(fill=correct), alpha=0.25)
    print(p1)
  }
  # Return data if requested (invisible NULL otherwise)
  if(isTRUE(returnData)) list(dfPred=dfPred, dfConf=dfConf, dfConfAll=dfConfAll)
}
# Sanity check: reproduce the manual prdTemp analysis via the function
tstFunc <- simpleOneVarPredict(tmpTemp, tgt="month", prd="rndTemp")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(tgt)
##
## # Now:
## data %>% select(all_of(tgt))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
##
## Accuracy by target subgroup:
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 4215 6201 10416 0.595 0.0833 6.14
## 2 Feb 8404 1076 9480 0.114 0.0833 0.362
## 3 Mar 8817 1599 10416 0.154 0.0833 0.842
## 4 Apr 6972 3108 10080 0.308 0.0833 2.7
## 5 May 8584 1832 10416 0.176 0.0833 1.11
## 6 Jun 8535 1185 9720 0.122 0.0833 0.463
## 7 Jul 6524 3148 9672 0.325 0.0833 2.91
## 8 Aug 4335 5337 9672 0.552 0.0833 5.62
## 9 Sep 6888 2472 9360 0.264 0.0833 2.17
## 10 Oct 7248 2424 9672 0.251 0.0833 2.01
## 11 Nov 9360 0 9360 0 0.0833 -1
## 12 Dec 7130 2542 9672 0.263 0.0833 2.15
##
## Overall Accuracy:
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 10416 87012 30924 117936 0.262 0.0883 1.97 134
# Inspect the returned prediction and confusion tables
tstFunc
## $dfPred
## # A tibble: 828 × 5
## rndTemp month n correct predicted
## <dbl> <fct> <int> <lgl> <fct>
## 1 -30 Jan 4 TRUE Jan
## 2 -29.5 Jan 6 TRUE Jan
## 3 -29 Jan 4 TRUE Jan
## 4 -28.5 Jan 6 TRUE Jan
## 5 -28 Jan 2 TRUE Jan
## 6 -27.5 Jan 1 TRUE Jan
## 7 -27 Jan 6 TRUE Jan
## 8 -26.5 Jan 4 TRUE Jan
## 9 -26 Jan 10 TRUE Jan
## 10 -25.5 Jan 15 TRUE Jan
## # … with 818 more rows
##
## $dfConf
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 4215 6201 10416 0.595 0.0833 6.14
## 2 Feb 8404 1076 9480 0.114 0.0833 0.362
## 3 Mar 8817 1599 10416 0.154 0.0833 0.842
## 4 Apr 6972 3108 10080 0.308 0.0833 2.7
## 5 May 8584 1832 10416 0.176 0.0833 1.11
## 6 Jun 8535 1185 9720 0.122 0.0833 0.463
## 7 Jul 6524 3148 9672 0.325 0.0833 2.91
## 8 Aug 4335 5337 9672 0.552 0.0833 5.62
## 9 Sep 6888 2472 9360 0.264 0.0833 2.17
## 10 Oct 7248 2424 9672 0.251 0.0833 2.01
## 11 Nov 9360 0 9360 0 0.0833 -1
## 12 Dec 7130 2542 9672 0.263 0.0833 2.15
##
## $dfConfAll
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 10416 87012 30924 117936 0.262 0.0883 1.97 134
# Confirm the function reproduces the manual computation (only column names differ)
all.equal(tstFunc$dfPred, rename(prdTemp, correct=corr, predicted=pred))
## [1] TRUE
The function is tested for the predictive power of numeric variables, bucketed into percentiles, on month:
# Add month/hour labels plus 0-100 percentile ("pct_") versions of every numeric column
tmpTemp <- tmpOMHourly$tblHourly %>%
  mutate(month=factor(month.abb[lubridate::month(date)], levels=month.abb),
         hour=lubridate::hour(time),
         fct_hour=factor(hour)
  ) %>%
  mutate(across(where(is.numeric),
                .fns=function(x) round(100*percent_rank(x)),
                .names="pct_{.col}"
  ))
tmpTemp
## # A tibble: 117,936 × 73
## time date hour temper…¹ relat…² dewpo…³ appar…⁴ press…⁵
## <dttm> <date> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2010-01-01 00:00:00 2010-01-01 0 -9.5 67 -14.4 -15.8 1024.
## 2 2010-01-01 01:00:00 2010-01-01 1 -9.8 69 -14.4 -16.3 1025.
## 3 2010-01-01 02:00:00 2010-01-01 2 -10.3 73 -14.2 -16.8 1025.
## 4 2010-01-01 03:00:00 2010-01-01 3 -10.8 74 -14.5 -17.2 1026.
## 5 2010-01-01 04:00:00 2010-01-01 4 -11.3 75 -14.8 -17.7 1026.
## 6 2010-01-01 05:00:00 2010-01-01 5 -11.8 76 -15.1 -18.2 1026.
## 7 2010-01-01 06:00:00 2010-01-01 6 -12.3 77 -15.5 -18.6 1027.
## 8 2010-01-01 07:00:00 2010-01-01 7 -12.8 78 -15.8 -19 1028.
## 9 2010-01-01 08:00:00 2010-01-01 8 -13.2 79 -16.1 -19.4 1028.
## 10 2010-01-01 09:00:00 2010-01-01 9 -13.4 78 -16.3 -19.6 1028.
## # … with 117,926 more rows, 65 more variables: surface_pressure <dbl>,
## # precipitation <dbl>, rain <dbl>, snowfall <dbl>, cloudcover <int>,
## # cloudcover_low <int>, cloudcover_mid <int>, cloudcover_high <int>,
## # shortwave_radiation <dbl>, direct_radiation <dbl>,
## # direct_normal_irradiance <dbl>, diffuse_radiation <dbl>,
## # windspeed_10m <dbl>, windspeed_100m <dbl>, winddirection_10m <int>,
## # winddirection_100m <int>, windgusts_10m <dbl>, …
# Collect the names of the percentile ("pct") columns for iteration
tmpNames <- names(tmpTemp) %>%
  stringr::str_subset("^pct")
tmpNames
## [1] "pct_hour" "pct_temperature_2m"
## [3] "pct_relativehumidity_2m" "pct_dewpoint_2m"
## [5] "pct_apparent_temperature" "pct_pressure_msl"
## [7] "pct_surface_pressure" "pct_precipitation"
## [9] "pct_rain" "pct_snowfall"
## [11] "pct_cloudcover" "pct_cloudcover_low"
## [13] "pct_cloudcover_mid" "pct_cloudcover_high"
## [15] "pct_shortwave_radiation" "pct_direct_radiation"
## [17] "pct_direct_normal_irradiance" "pct_diffuse_radiation"
## [19] "pct_windspeed_10m" "pct_windspeed_100m"
## [21] "pct_winddirection_10m" "pct_winddirection_100m"
## [23] "pct_windgusts_10m" "pct_et0_fao_evapotranspiration"
## [25] "pct_weathercode" "pct_vapor_pressure_deficit"
## [27] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [29] "pct_soil_temperature_28_to_100cm" "pct_soil_temperature_100_to_255cm"
## [31] "pct_soil_moisture_0_to_7cm" "pct_soil_moisture_7_to_28cm"
## [33] "pct_soil_moisture_28_to_100cm" "pct_soil_moisture_100_to_255cm"
# Overall accuracy/lift of each percentile variable as a predictor of month
tmpDFR <- tmpNames %>%
  map(function(x) simpleOneVarPredict(tmpTemp, tgt="month", prd=x, nPrint=0, showPlot=FALSE)$dfConfAll) %>%
  list_rbind() %>%
  mutate(vrbl=tmpNames) %>%
  arrange(desc(lift))
# Show the full ranking
print(tmpDFR, n=50)
## # A tibble: 34 × 9
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket vrbl
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int> <chr>
## 1 10416 64759 53177 117936 0.451 0.0883 4.11 101 pct_soil_temp…
## 2 10416 64819 53117 117936 0.450 0.0883 4.10 101 pct_soil_temp…
## 3 10416 74369 43567 117936 0.369 0.0883 3.18 101 pct_soil_temp…
## 4 10416 81800 36136 117936 0.306 0.0883 2.47 66 pct_soil_mois…
## 5 10416 83423 34513 117936 0.293 0.0883 2.31 101 pct_soil_temp…
## 6 10416 86085 31851 117936 0.270 0.0883 2.06 96 pct_soil_mois…
## 7 10416 86997 30939 117936 0.262 0.0883 1.97 101 pct_temperatu…
## 8 10416 87078 30858 117936 0.262 0.0883 1.96 101 pct_apparent_…
## 9 10416 89320 28616 117936 0.243 0.0883 1.75 101 pct_dewpoint_…
## 10 10416 92788 25148 117936 0.213 0.0883 1.41 100 pct_soil_mois…
## 11 10416 94975 22961 117936 0.195 0.0883 1.20 101 pct_soil_mois…
## 12 10416 95744 22192 117936 0.188 0.0883 1.13 79 pct_vapor_pre…
## 13 10416 100072 17864 117936 0.151 0.0883 0.715 101 pct_pressure_…
## 14 10416 100766 17170 117936 0.146 0.0883 0.648 101 pct_surface_p…
## 15 10416 101246 16690 117936 0.142 0.0883 0.602 40 pct_et0_fao_e…
## 16 10416 102676 15260 117936 0.129 0.0883 0.465 101 pct_winddirec…
## 17 10416 102820 15116 117936 0.128 0.0883 0.451 101 pct_winddirec…
## 18 10416 102853 15083 117936 0.128 0.0883 0.448 64 pct_cloudcover
## 19 10416 102996 14940 117936 0.127 0.0883 0.434 46 pct_cloudcove…
## 20 10416 103110 14826 117936 0.126 0.0883 0.423 55 pct_diffuse_r…
## 21 10416 103169 14767 117936 0.125 0.0883 0.418 55 pct_shortwave…
## 22 10416 103186 14750 117936 0.125 0.0883 0.416 101 pct_windspeed…
## 23 10416 103197 14739 117936 0.125 0.0883 0.415 12 pct_weatherco…
## 24 10416 103299 14637 117936 0.124 0.0883 0.405 101 pct_windspeed…
## 25 10416 103594 14342 117936 0.122 0.0883 0.377 50 pct_direct_ra…
## 26 10416 103778 14158 117936 0.120 0.0883 0.359 50 pct_direct_no…
## 27 10416 103800 14136 117936 0.120 0.0883 0.357 50 pct_cloudcove…
## 28 10416 103861 14075 117936 0.119 0.0883 0.351 97 pct_windgusts…
## 29 10416 105069 12867 117936 0.109 0.0883 0.235 60 pct_relativeh…
## 30 10416 105133 12803 117936 0.109 0.0883 0.229 41 pct_cloudcove…
## 31 10416 106183 11753 117936 0.0997 0.0883 0.128 5 pct_snowfall
## 32 10416 106324 11612 117936 0.0985 0.0883 0.115 12 pct_rain
## 33 10416 106924 11012 117936 0.0934 0.0883 0.0572 13 pct_precipita…
## 34 10416 107520 10416 117936 0.0883 0.0883 0 24 pct_hour
# Bar chart of lift per variable, best month predictors on top
tmpDFR %>%
  mutate(shortName = stringr::str_replace_all(vrbl, "pct_", "")) %>%
  ggplot(aes(x = fct_reorder(shortName, lift), y = lift)) +
  geom_col(fill = "lightblue") +
  coord_flip() +
  labs(x = NULL, y = "lift", title = "Lift by hourly variable percentile in predicting month")
# Example for soil temperature and high clouds
# Detailed confusion output for the strongest month predictor (deep soil temperature)
simpleOneVarPredict(tmpTemp, tgt="month", prd="pct_soil_temperature_100_to_255cm", returnData=FALSE)
##
## Accuracy by target subgroup:
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 5395 5021 10416 0.482 0.0833 4.78
## 2 Feb 5094 4386 9480 0.463 0.0833 4.55
## 3 Mar 3039 7377 10416 0.708 0.0833 7.50
## 4 Apr 8862 1218 10080 0.121 0.0833 0.45
## 5 May 7343 3073 10416 0.295 0.0833 2.54
## 6 Jun 5365 4355 9720 0.448 0.0833 4.38
## 7 Jul 4445 5227 9672 0.540 0.0833 5.49
## 8 Aug 4762 4910 9672 0.508 0.0833 5.09
## 9 Sep 2744 6616 9360 0.707 0.0833 7.48
## 10 Oct 6858 2814 9672 0.291 0.0833 2.49
## 11 Nov 5822 3538 9360 0.378 0.0833 3.54
## 12 Dec 5030 4642 9672 0.480 0.0833 4.76
##
## Overall Accuracy:
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 10416 64759 53177 117936 0.451 0.0883 4.11 101
# Contrast: high cloud cover is a much weaker month predictor
simpleOneVarPredict(tmpTemp, tgt="month", prd="pct_cloudcover_high", returnData=FALSE)
##
## Accuracy by target subgroup:
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 5425 4991 10416 0.479 0.0833 4.75
## 2 Feb 9480 0 9480 0 0.0833 -1
## 3 Mar 9807 609 10416 0.0585 0.0833 -0.298
## 4 Apr 9963 117 10080 0.0116 0.0833 -0.861
## 5 May 6551 3865 10416 0.371 0.0833 3.45
## 6 Jun 9345 375 9720 0.0386 0.0833 -0.537
## 7 Jul 8338 1334 9672 0.138 0.0833 0.655
## 8 Aug 8467 1205 9672 0.125 0.0833 0.495
## 9 Sep 9360 0 9360 0 0.0833 -1
## 10 Oct 9529 143 9672 0.0148 0.0833 -0.823
## 11 Nov 9360 0 9360 0 0.0833 -1
## 12 Dec 9508 164 9672 0.0170 0.0833 -0.797
##
## Overall Accuracy:
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 10416 105133 12803 117936 0.109 0.0883 0.229 41
The function is tested for the predictive power of numeric variables, bucketed into percentiles, on hour:
# Drop the hour percentile itself before predicting hour
tmpNamesNoHour <- tmpNames[tmpNames != "pct_hour"]
tmpNamesNoHour
## [1] "pct_temperature_2m" "pct_relativehumidity_2m"
## [3] "pct_dewpoint_2m" "pct_apparent_temperature"
## [5] "pct_pressure_msl" "pct_surface_pressure"
## [7] "pct_precipitation" "pct_rain"
## [9] "pct_snowfall" "pct_cloudcover"
## [11] "pct_cloudcover_low" "pct_cloudcover_mid"
## [13] "pct_cloudcover_high" "pct_shortwave_radiation"
## [15] "pct_direct_radiation" "pct_direct_normal_irradiance"
## [17] "pct_diffuse_radiation" "pct_windspeed_10m"
## [19] "pct_windspeed_100m" "pct_winddirection_10m"
## [21] "pct_winddirection_100m" "pct_windgusts_10m"
## [23] "pct_et0_fao_evapotranspiration" "pct_weathercode"
## [25] "pct_vapor_pressure_deficit" "pct_soil_temperature_0_to_7cm"
## [27] "pct_soil_temperature_7_to_28cm" "pct_soil_temperature_28_to_100cm"
## [29] "pct_soil_temperature_100_to_255cm" "pct_soil_moisture_0_to_7cm"
## [31] "pct_soil_moisture_7_to_28cm" "pct_soil_moisture_28_to_100cm"
## [33] "pct_soil_moisture_100_to_255cm"
# Overall accuracy/lift of each percentile variable as a predictor of hour
tmpDFRHour <- tmpNamesNoHour %>%
  map(function(x) simpleOneVarPredict(tmpTemp, tgt="fct_hour", prd=x, nPrint=0, showPlot=FALSE)$dfConfAll) %>%
  list_rbind() %>%
  mutate(vrbl=tmpNamesNoHour) %>%
  arrange(desc(lift))
# Show the full ranking
print(tmpDFRHour, n=50)
## # A tibble: 33 × 9
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket vrbl
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int> <chr>
## 1 4914 102257 15679 117936 0.133 0.0417 2.19 55 pct_diffuse_r…
## 2 4914 102824 15112 117936 0.128 0.0417 2.08 55 pct_shortwave…
## 3 4914 105224 12712 117936 0.108 0.0417 1.59 50 pct_direct_ra…
## 4 4914 105893 12043 117936 0.102 0.0417 1.45 50 pct_direct_no…
## 5 4914 106137 11799 117936 0.100 0.0417 1.40 40 pct_et0_fao_e…
## 6 4914 109462 8474 117936 0.0719 0.0417 0.724 60 pct_relativeh…
## 7 4914 109962 7974 117936 0.0676 0.0417 0.623 79 pct_vapor_pre…
## 8 4914 110201 7735 117936 0.0656 0.0417 0.574 101 pct_soil_temp…
## 9 4914 110597 7339 117936 0.0622 0.0417 0.493 97 pct_windgusts…
## 10 4914 110778 7158 117936 0.0607 0.0417 0.457 101 pct_windspeed…
## 11 4914 110854 7082 117936 0.0600 0.0417 0.441 101 pct_temperatu…
## 12 4914 110941 6995 117936 0.0593 0.0417 0.423 101 pct_winddirec…
## 13 4914 110958 6978 117936 0.0592 0.0417 0.420 101 pct_windspeed…
## 14 4914 110983 6953 117936 0.0590 0.0417 0.415 101 pct_apparent_…
## 15 4914 111081 6855 117936 0.0581 0.0417 0.395 101 pct_winddirec…
## 16 4914 111522 6414 117936 0.0544 0.0417 0.305 101 pct_pressure_…
## 17 4914 111548 6388 117936 0.0542 0.0417 0.300 101 pct_surface_p…
## 18 4914 111600 6336 117936 0.0537 0.0417 0.289 64 pct_cloudcover
## 19 4914 111643 6293 117936 0.0534 0.0417 0.281 101 pct_dewpoint_…
## 20 4914 111667 6269 117936 0.0532 0.0417 0.276 101 pct_soil_temp…
## 21 4914 111831 6105 117936 0.0518 0.0417 0.242 46 pct_cloudcove…
## 22 4914 111846 6090 117936 0.0516 0.0417 0.239 50 pct_cloudcove…
## 23 4914 111908 6028 117936 0.0511 0.0417 0.227 101 pct_soil_mois…
## 24 4914 112038 5898 117936 0.0500 0.0417 0.200 100 pct_soil_mois…
## 25 4914 112197 5739 117936 0.0487 0.0417 0.168 12 pct_weatherco…
## 26 4914 112274 5662 117936 0.0480 0.0417 0.152 41 pct_cloudcove…
## 27 4914 112537 5399 117936 0.0458 0.0417 0.0987 96 pct_soil_mois…
## 28 4914 112566 5370 117936 0.0455 0.0417 0.0928 101 pct_soil_temp…
## 29 4914 112687 5249 117936 0.0445 0.0417 0.0682 12 pct_rain
## 30 4914 112691 5245 117936 0.0445 0.0417 0.0674 13 pct_precipita…
## 31 4914 112702 5234 117936 0.0444 0.0417 0.0651 101 pct_soil_temp…
## 32 4914 112803 5133 117936 0.0435 0.0417 0.0446 66 pct_soil_mois…
## 33 4914 112966 4970 117936 0.0421 0.0417 0.0114 5 pct_snowfall
# Bar chart of lift per variable, best hour predictors on top
tmpDFRHour %>%
  mutate(shortName = stringr::str_replace_all(vrbl, "pct_", "")) %>%
  ggplot(aes(x = fct_reorder(shortName, lift), y = lift)) +
  geom_col(fill = "lightblue") +
  coord_flip() +
  labs(x = NULL, y = "lift", title = "Lift by hourly variable percentile in predicting hour")
# Example for diffuse radiation and soil moisture
# Detailed confusion output for the strongest hour predictor (diffuse radiation)
simpleOneVarPredict(tmpTemp, tgt="fct_hour", prd="pct_diffuse_radiation", returnData=FALSE)
##
## Accuracy by target subgroup:
## # A tibble: 24 × 7
## fct_hour `TRUE` `FALSE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 0 4914 0 4914 1 0.0417 23
## 2 1 0 4914 4914 0 0.0417 -1
## 3 2 0 4914 4914 0 0.0417 -1
## 4 3 0 4914 4914 0 0.0417 -1
## 5 4 0 4914 4914 0 0.0417 -1
## 6 5 0 4914 4914 0 0.0417 -1
## 7 6 554 4360 4914 0.113 0.0417 1.71
## 8 7 671 4243 4914 0.137 0.0417 2.28
## 9 8 735 4179 4914 0.150 0.0417 2.59
## 10 9 861 4053 4914 0.175 0.0417 3.21
## 11 10 763 4151 4914 0.155 0.0417 2.73
## 12 11 884 4030 4914 0.180 0.0417 3.32
## 13 12 614 4300 4914 0.125 0.0417 2.00
## 14 13 1130 3784 4914 0.230 0.0417 4.52
## 15 14 1395 3519 4914 0.284 0.0417 5.81
## 16 15 676 4238 4914 0.138 0.0417 2.30
## 17 16 150 4764 4914 0.0305 0.0417 -0.267
## 18 17 0 4914 4914 0 0.0417 -1
## 19 18 324 4590 4914 0.0659 0.0417 0.582
## 20 19 427 4487 4914 0.0869 0.0417 1.09
## 21 20 742 4172 4914 0.151 0.0417 2.62
## 22 21 839 4075 4914 0.171 0.0417 3.10
## 23 22 0 4914 4914 0 0.0417 -1
## 24 23 0 4914 4914 0 0.0417 -1
##
## Overall Accuracy:
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 4914 102257 15679 117936 0.133 0.0417 2.19 55
# Contrast: a near-zero-lift predictor of hour (deep soil moisture)
simpleOneVarPredict(tmpTemp, tgt="fct_hour", prd="pct_soil_moisture_100_to_255cm", returnData=FALSE)
##
## Accuracy by target subgroup:
## # A tibble: 24 × 7
## fct_hour `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 0 4518 396 4914 0.0806 0.0417 0.934
## 2 1 4687 227 4914 0.0462 0.0417 0.109
## 3 2 4884 30 4914 0.00611 0.0417 -0.853
## 4 3 4673 241 4914 0.0490 0.0417 0.177
## 5 4 4739 175 4914 0.0356 0.0417 -0.145
## 6 5 4588 326 4914 0.0663 0.0417 0.592
## 7 6 4756 158 4914 0.0322 0.0417 -0.228
## 8 7 4683 231 4914 0.0470 0.0417 0.128
## 9 8 4620 294 4914 0.0598 0.0417 0.436
## 10 9 4619 295 4914 0.0600 0.0417 0.441
## 11 10 4616 298 4914 0.0606 0.0417 0.455
## 12 11 4766 148 4914 0.0301 0.0417 -0.277
## 13 12 4825 89 4914 0.0181 0.0417 -0.565
## 14 13 4539 375 4914 0.0763 0.0417 0.832
## 15 14 4882 32 4914 0.00651 0.0417 -0.844
## 16 15 4611 303 4914 0.0617 0.0417 0.480
## 17 16 4730 184 4914 0.0374 0.0417 -0.101
## 18 17 4825 89 4914 0.0181 0.0417 -0.565
## 19 18 4422 492 4914 0.100 0.0417 1.40
## 20 19 4774 140 4914 0.0285 0.0417 -0.316
## 21 20 4788 126 4914 0.0256 0.0417 -0.385
## 22 21 4533 381 4914 0.0775 0.0417 0.861
## 23 22 4865 49 4914 0.00997 0.0417 -0.761
## 24 23 4860 54 4914 0.0110 0.0417 -0.736
##
## Overall Accuracy:
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 4914 112803 5133 117936 0.0435 0.0417 0.0446 66
Random variables, split equally 0-5, 0-25, and 0-100, are included as an example null state:
# Add random variables
# Seed fixed for reproducibility; each rndNNN column draws uniform integers
# 0..NNN to serve as a null benchmark for the lift comparisons below
set.seed(23072413)
tmpTemp <- tmpTemp %>%
mutate(rnd005=sample(0:5, size=n(), replace=TRUE),
rnd025=sample(0:25, size=n(), replace=TRUE),
rnd100=sample(0:100, size=n(), replace=TRUE)
)
# Get key variable names
# All percentile (pct_) columns plus the three random (rnd) columns
tmpNames_v2 <- tmpTemp %>%
select(starts_with("pct"), starts_with("rnd")) %>%
names()
tmpNames_v2
## [1] "pct_hour" "pct_temperature_2m"
## [3] "pct_relativehumidity_2m" "pct_dewpoint_2m"
## [5] "pct_apparent_temperature" "pct_pressure_msl"
## [7] "pct_surface_pressure" "pct_precipitation"
## [9] "pct_rain" "pct_snowfall"
## [11] "pct_cloudcover" "pct_cloudcover_low"
## [13] "pct_cloudcover_mid" "pct_cloudcover_high"
## [15] "pct_shortwave_radiation" "pct_direct_radiation"
## [17] "pct_direct_normal_irradiance" "pct_diffuse_radiation"
## [19] "pct_windspeed_10m" "pct_windspeed_100m"
## [21] "pct_winddirection_10m" "pct_winddirection_100m"
## [23] "pct_windgusts_10m" "pct_et0_fao_evapotranspiration"
## [25] "pct_weathercode" "pct_vapor_pressure_deficit"
## [27] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [29] "pct_soil_temperature_28_to_100cm" "pct_soil_temperature_100_to_255cm"
## [31] "pct_soil_moisture_0_to_7cm" "pct_soil_moisture_7_to_28cm"
## [33] "pct_soil_moisture_28_to_100cm" "pct_soil_moisture_100_to_255cm"
## [35] "rnd005" "rnd025"
## [37] "rnd100"
# Get the key predictive metrics
# One overall-accuracy row per candidate predictor of month; mutate(vrbl=...)
# relies on map_dfr() returning rows in the same order as tmpNames_v2
tmpDFR_v2 <- map_dfr(.x=tmpNames_v2,
.f=function(x) simpleOneVarPredict(tmpTemp, tgt="month", prd=x, nPrint=0, showPlot=FALSE)$dfConfAll
) %>%
mutate(vrbl=tmpNames_v2) %>%
arrange(desc(lift))
# Print and plot lift by variable
tmpDFR_v2 %>%
print(n=50)
## # A tibble: 37 × 9
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket vrbl
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int> <chr>
## 1 10416 64759 53177 117936 0.451 0.0883 4.11 101 pct_soil_temp…
## 2 10416 64819 53117 117936 0.450 0.0883 4.10 101 pct_soil_temp…
## 3 10416 74369 43567 117936 0.369 0.0883 3.18 101 pct_soil_temp…
## 4 10416 81800 36136 117936 0.306 0.0883 2.47 66 pct_soil_mois…
## 5 10416 83423 34513 117936 0.293 0.0883 2.31 101 pct_soil_temp…
## 6 10416 86085 31851 117936 0.270 0.0883 2.06 96 pct_soil_mois…
## 7 10416 86997 30939 117936 0.262 0.0883 1.97 101 pct_temperatu…
## 8 10416 87078 30858 117936 0.262 0.0883 1.96 101 pct_apparent_…
## 9 10416 89320 28616 117936 0.243 0.0883 1.75 101 pct_dewpoint_…
## 10 10416 92788 25148 117936 0.213 0.0883 1.41 100 pct_soil_mois…
## 11 10416 94975 22961 117936 0.195 0.0883 1.20 101 pct_soil_mois…
## 12 10416 95744 22192 117936 0.188 0.0883 1.13 79 pct_vapor_pre…
## 13 10416 100072 17864 117936 0.151 0.0883 0.715 101 pct_pressure_…
## 14 10416 100766 17170 117936 0.146 0.0883 0.648 101 pct_surface_p…
## 15 10416 101246 16690 117936 0.142 0.0883 0.602 40 pct_et0_fao_e…
## 16 10416 102676 15260 117936 0.129 0.0883 0.465 101 pct_winddirec…
## 17 10416 102820 15116 117936 0.128 0.0883 0.451 101 pct_winddirec…
## 18 10416 102853 15083 117936 0.128 0.0883 0.448 64 pct_cloudcover
## 19 10416 102996 14940 117936 0.127 0.0883 0.434 46 pct_cloudcove…
## 20 10416 103110 14826 117936 0.126 0.0883 0.423 55 pct_diffuse_r…
## 21 10416 103169 14767 117936 0.125 0.0883 0.418 55 pct_shortwave…
## 22 10416 103186 14750 117936 0.125 0.0883 0.416 101 pct_windspeed…
## 23 10416 103197 14739 117936 0.125 0.0883 0.415 12 pct_weatherco…
## 24 10416 103299 14637 117936 0.124 0.0883 0.405 101 pct_windspeed…
## 25 10416 103594 14342 117936 0.122 0.0883 0.377 50 pct_direct_ra…
## 26 10416 103778 14158 117936 0.120 0.0883 0.359 50 pct_direct_no…
## 27 10416 103800 14136 117936 0.120 0.0883 0.357 50 pct_cloudcove…
## 28 10416 103861 14075 117936 0.119 0.0883 0.351 97 pct_windgusts…
## 29 10416 105069 12867 117936 0.109 0.0883 0.235 60 pct_relativeh…
## 30 10416 105133 12803 117936 0.109 0.0883 0.229 41 pct_cloudcove…
## 31 10416 106183 11753 117936 0.0997 0.0883 0.128 5 pct_snowfall
## 32 10416 106299 11637 117936 0.0987 0.0883 0.117 101 rnd100
## 33 10416 106324 11612 117936 0.0985 0.0883 0.115 12 pct_rain
## 34 10416 106924 11012 117936 0.0934 0.0883 0.0572 13 pct_precipita…
## 35 10416 106943 10993 117936 0.0932 0.0883 0.0554 26 rnd025
## 36 10416 107317 10619 117936 0.0900 0.0883 0.0195 6 rnd005
## 37 10416 107520 10416 117936 0.0883 0.0883 0 24 pct_hour
# Plot lift per predictor of month; random null-state variables shown in red
tmpDFR_v2 %>%
mutate(fillColor=ifelse(str_detect(vrbl, pattern="pct_"), "lightblue", "red")) %>%
ggplot(aes(x=fct_reorder(stringr::str_replace_all(vrbl, "pct_", ""), lift), y=lift)) +
geom_col(aes(fill=fillColor)) +
coord_flip() +
labs(x=NULL, y="lift", title="Lift by hourly variable percentile in predicting month") +
scale_fill_identity()
# Remove hour from tmpNames
# NOTE(review): pct_hour appears to be derived from hour itself, so it is
# dropped before predicting fct_hour -- confirm intent
tmpNamesNoHour_v2 <- setdiff(tmpNames_v2, "pct_hour")
tmpNamesNoHour_v2
## [1] "pct_temperature_2m" "pct_relativehumidity_2m"
## [3] "pct_dewpoint_2m" "pct_apparent_temperature"
## [5] "pct_pressure_msl" "pct_surface_pressure"
## [7] "pct_precipitation" "pct_rain"
## [9] "pct_snowfall" "pct_cloudcover"
## [11] "pct_cloudcover_low" "pct_cloudcover_mid"
## [13] "pct_cloudcover_high" "pct_shortwave_radiation"
## [15] "pct_direct_radiation" "pct_direct_normal_irradiance"
## [17] "pct_diffuse_radiation" "pct_windspeed_10m"
## [19] "pct_windspeed_100m" "pct_winddirection_10m"
## [21] "pct_winddirection_100m" "pct_windgusts_10m"
## [23] "pct_et0_fao_evapotranspiration" "pct_weathercode"
## [25] "pct_vapor_pressure_deficit" "pct_soil_temperature_0_to_7cm"
## [27] "pct_soil_temperature_7_to_28cm" "pct_soil_temperature_28_to_100cm"
## [29] "pct_soil_temperature_100_to_255cm" "pct_soil_moisture_0_to_7cm"
## [31] "pct_soil_moisture_7_to_28cm" "pct_soil_moisture_28_to_100cm"
## [33] "pct_soil_moisture_100_to_255cm" "rnd005"
## [35] "rnd025" "rnd100"
# Get the key predictive metrics
# Same lift computation as above, now predicting hour of day (fct_hour);
# row order of map_dfr() matches tmpNamesNoHour_v2, enabling mutate(vrbl=...)
tmpDFRHour_v2 <- map_dfr(.x=tmpNamesNoHour_v2,
.f=function(x) simpleOneVarPredict(tmpTemp, tgt="fct_hour", prd=x, nPrint=0, showPlot=FALSE)$dfConfAll
) %>%
mutate(vrbl=tmpNamesNoHour_v2) %>%
arrange(desc(lift))
# Print and plot lift by variable
tmpDFRHour_v2 %>%
print(n=50)
## # A tibble: 36 × 9
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket vrbl
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int> <chr>
## 1 4914 102257 15679 117936 0.133 0.0417 2.19 55 pct_diffuse_r…
## 2 4914 102824 15112 117936 0.128 0.0417 2.08 55 pct_shortwave…
## 3 4914 105224 12712 117936 0.108 0.0417 1.59 50 pct_direct_ra…
## 4 4914 105893 12043 117936 0.102 0.0417 1.45 50 pct_direct_no…
## 5 4914 106137 11799 117936 0.100 0.0417 1.40 40 pct_et0_fao_e…
## 6 4914 109462 8474 117936 0.0719 0.0417 0.724 60 pct_relativeh…
## 7 4914 109962 7974 117936 0.0676 0.0417 0.623 79 pct_vapor_pre…
## 8 4914 110201 7735 117936 0.0656 0.0417 0.574 101 pct_soil_temp…
## 9 4914 110597 7339 117936 0.0622 0.0417 0.493 97 pct_windgusts…
## 10 4914 110778 7158 117936 0.0607 0.0417 0.457 101 pct_windspeed…
## 11 4914 110854 7082 117936 0.0600 0.0417 0.441 101 pct_temperatu…
## 12 4914 110941 6995 117936 0.0593 0.0417 0.423 101 pct_winddirec…
## 13 4914 110958 6978 117936 0.0592 0.0417 0.420 101 pct_windspeed…
## 14 4914 110983 6953 117936 0.0590 0.0417 0.415 101 pct_apparent_…
## 15 4914 111081 6855 117936 0.0581 0.0417 0.395 101 pct_winddirec…
## 16 4914 111522 6414 117936 0.0544 0.0417 0.305 101 pct_pressure_…
## 17 4914 111548 6388 117936 0.0542 0.0417 0.300 101 pct_surface_p…
## 18 4914 111600 6336 117936 0.0537 0.0417 0.289 64 pct_cloudcover
## 19 4914 111643 6293 117936 0.0534 0.0417 0.281 101 pct_dewpoint_…
## 20 4914 111654 6282 117936 0.0533 0.0417 0.278 101 rnd100
## 21 4914 111667 6269 117936 0.0532 0.0417 0.276 101 pct_soil_temp…
## 22 4914 111831 6105 117936 0.0518 0.0417 0.242 46 pct_cloudcove…
## 23 4914 111846 6090 117936 0.0516 0.0417 0.239 50 pct_cloudcove…
## 24 4914 111908 6028 117936 0.0511 0.0417 0.227 101 pct_soil_mois…
## 25 4914 112038 5898 117936 0.0500 0.0417 0.200 100 pct_soil_mois…
## 26 4914 112197 5739 117936 0.0487 0.0417 0.168 12 pct_weatherco…
## 27 4914 112274 5662 117936 0.0480 0.0417 0.152 41 pct_cloudcove…
## 28 4914 112337 5599 117936 0.0475 0.0417 0.139 26 rnd025
## 29 4914 112537 5399 117936 0.0458 0.0417 0.0987 96 pct_soil_mois…
## 30 4914 112566 5370 117936 0.0455 0.0417 0.0928 101 pct_soil_temp…
## 31 4914 112687 5249 117936 0.0445 0.0417 0.0682 12 pct_rain
## 32 4914 112691 5245 117936 0.0445 0.0417 0.0674 13 pct_precipita…
## 33 4914 112702 5234 117936 0.0444 0.0417 0.0651 101 pct_soil_temp…
## 34 4914 112711 5225 117936 0.0443 0.0417 0.0633 6 rnd005
## 35 4914 112803 5133 117936 0.0435 0.0417 0.0446 66 pct_soil_mois…
## 36 4914 112966 4970 117936 0.0421 0.0417 0.0114 5 pct_snowfall
# Plot lift per predictor of hour; random null-state variables shown in red
tmpDFRHour_v2 %>%
mutate(fillColor=ifelse(str_detect(vrbl, pattern="pct_"), "lightblue", "red")) %>%
ggplot(aes(x=fct_reorder(stringr::str_replace_all(vrbl, "pct_", ""), lift), y=lift)) +
geom_col(aes(fill=fillColor)) +
coord_flip() +
labs(x=NULL, y="lift", title="Lift by hourly variable percentile in predicting hour") +
scale_fill_identity()
Function simpleOneVarPredict() is updated to allow for test-train:
simpleOneVarPredict <- function(df,
                                tgt,
                                prd,
                                dfTest=NULL,
                                nPrint=30,
                                showPlot=TRUE,
                                returnData=TRUE
                                ) {
  # Predict a categorical target from a single bucketed predictor by majority
  # vote: each predictor value predicts its most frequent target in df.
  # FUNCTION ARGUMENTS:
  # df: data frame or tibble with key elements (training data set)
  # tgt: target variable (string name of a column in df)
  # prd: predictor variable (string name of a column in df)
  # dfTest: test dataset for applying predictions (NULL means no test scoring)
  # nPrint: maximum number of lines of confusion matrix to print
  #   0 means do not print any summary statistics
  # showPlot: boolean, should overlap plot be created and shown?
  # returnData: boolean, should the prediction/confusion data frames be returned as a list?
  # Counts of predictor to target variable; after sorting by desc(n) within each
  # predictor value, row 1 is the winning (predicted) target
  dfPred <- df %>%
    group_by(across(all_of(c(prd, tgt)))) %>%
    summarize(n=n(), .groups="drop") %>%
    arrange(across(all_of(prd)), desc(n)) %>%
    group_by(across(all_of(prd))) %>%
    mutate(correct=row_number()==1, predicted=first(get(tgt))) %>%
    ungroup()
  # Confusion matrix and accuracy by target level (training data); naive
  # benchmark here is a uniform guess across target levels (1/number of levels)
  dfConf <- dfPred %>%
    group_by(across(all_of(c(tgt, "correct")))) %>%
    summarize(n=sum(n), .groups="drop") %>%
    pivot_wider(id_cols=tgt, names_from=correct, values_from=n, values_fill=0) %>%
    mutate(n=`TRUE`+`FALSE`,
           pctCorrect=`TRUE`/n,
           pctNaive=1/(nrow(.)),
           lift=pctCorrect/pctNaive-1
           )
  # Overall confusion matrix (training data); naive benchmark is always
  # guessing the modal target (nMax/n)
  dfConfAll <- dfConf %>%
    summarize(nMax=max(n), across(c(`FALSE`, `TRUE`, "n"), sum)) %>%
    mutate(pctCorrect=`TRUE`/n,
           pctNaive=nMax/n,
           lift=pctCorrect/pctNaive-1,
           nBucket=length(unique(dfPred[[prd]]))
           )
  # Print confusion matrices
  if(nPrint > 0) {
    cat("\nAccuracy by target subgroup (training data):\n")
    dfConf %>% print(n=nPrint)
    cat("\nOverall Accuracy (training data):\n")
    dfConfAll %>% print(n=nPrint)
  }
  # Plot of overlaps (actual vs. predicted heat map with counts)
  if(isTRUE(showPlot)) {
    p1 <- dfPred %>%
      group_by(across(c(all_of(tgt), "predicted", "correct"))) %>%
      summarize(n=sum(n), .groups="drop") %>%
      ggplot(aes(x=get(tgt), y=predicted)) +
      labs(x="Actual",
           y="Predicted",
           title=paste0("Training data - Actual vs. predicted ", tgt),
           subtitle=paste0("(using ", prd, ")")
           ) +
      geom_text(aes(label=n)) +
      geom_tile(aes(fill=correct), alpha=0.25)
    print(p1)
  }
  # Create metrics for test dataset if requested
  if(!is.null(dfTest)) {
    # Most common predicted category in training; fallback for predictor values
    # unseen in training. BUG FIX: sort=TRUE is required -- without it slice(1)
    # returns the first level of predicted, not the most frequent prediction.
    mostPredicted <- count(dfPred, predicted, wt=n, sort=TRUE) %>% slice(1) %>% pull(predicted)
    # Get mapping of metric to prediction
    dfPredict <- dfPred %>%
      group_by(across(all_of(c(prd, "predicted")))) %>%
      summarize(n=sum(n), .groups="drop")
    # Create predictions for test data; explicit by= documents the join key and
    # suppresses the "Joining with ..." message
    dfPredTest <- dfTest %>%
      select(all_of(c(prd, tgt))) %>%
      left_join(select(dfPredict, -n), by=prd) %>%
      replace_na(list(predicted=mostPredicted)) %>%
      group_by(across(all_of(c(prd, tgt, "predicted")))) %>%
      summarize(n=n(), .groups="drop") %>%
      mutate(correct=(get(tgt)==predicted))
    # Create confusion statistics for test data
    dfConfTest <- dfPredTest %>%
      group_by(across(all_of(c(tgt, "correct")))) %>%
      summarize(n=sum(n), .groups="drop") %>%
      pivot_wider(id_cols=tgt, names_from=correct, values_from=n, values_fill=0) %>%
      mutate(n=`TRUE`+`FALSE`,
             pctCorrect=`TRUE`/n,
             pctNaive=1/(nrow(.)),
             lift=pctCorrect/pctNaive-1
             )
    # Overall confusion matrix for test data
    dfConfAllTest <- dfConfTest %>%
      summarize(nMax=max(n), across(c(`FALSE`, `TRUE`, "n"), sum)) %>%
      mutate(pctCorrect=`TRUE`/n,
             pctNaive=nMax/n,
             lift=pctCorrect/pctNaive-1,
             # BUG FIX: bucket count must come from dfPredTest (which carries
             # the prd column); dfConfTest has no prd column, so the previous
             # code always reported nBucket=0
             nBucket=length(unique(dfPredTest[[prd]]))
             )
    # Print confusion matrices
    if(nPrint > 0) {
      cat("\nAccuracy by target subgroup (testing data):\n")
      dfConfTest %>% print(n=nPrint)
      cat("\nOverall Accuracy (testing data):\n")
      dfConfAllTest %>% print(n=nPrint)
    }
  } else {
    dfPredTest <- NULL
    dfConfTest <- NULL
    dfConfAllTest <- NULL
  }
  # Return data if requested
  if(isTRUE(returnData)) list(dfPred=dfPred,
                              dfConf=dfConf,
                              dfConfAll=dfConfAll,
                              dfPredTest=dfPredTest,
                              dfConfTest=dfConfTest,
                              dfConfAllTest=dfConfAllTest
                              )
}
# Original format
# No dfTest supplied, so the test elements of the returned list are NULL
simpleOneVarPredict(tmpTemp, tgt="month", prd="pct_soil_temperature_100_to_255cm", showPlot=FALSE)
##
## Accuracy by target subgroup (training data):
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 5395 5021 10416 0.482 0.0833 4.78
## 2 Feb 5094 4386 9480 0.463 0.0833 4.55
## 3 Mar 3039 7377 10416 0.708 0.0833 7.50
## 4 Apr 8862 1218 10080 0.121 0.0833 0.45
## 5 May 7343 3073 10416 0.295 0.0833 2.54
## 6 Jun 5365 4355 9720 0.448 0.0833 4.38
## 7 Jul 4445 5227 9672 0.540 0.0833 5.49
## 8 Aug 4762 4910 9672 0.508 0.0833 5.09
## 9 Sep 2744 6616 9360 0.707 0.0833 7.48
## 10 Oct 6858 2814 9672 0.291 0.0833 2.49
## 11 Nov 5822 3538 9360 0.378 0.0833 3.54
## 12 Dec 5030 4642 9672 0.480 0.0833 4.76
##
## Overall Accuracy (training data):
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 10416 64759 53177 117936 0.451 0.0883 4.11 101
## $dfPred
## # A tibble: 402 × 5
## pct_soil_temperature_100_to_255cm month n correct predicted
## <dbl> <fct> <int> <lgl> <fct>
## 1 0 Apr 409 TRUE Apr
## 2 0 Mar 260 FALSE Apr
## 3 1 Mar 1162 TRUE Mar
## 4 1 Apr 262 FALSE Mar
## 5 2 Mar 1265 TRUE Mar
## 6 2 Apr 257 FALSE Mar
## 7 3 Mar 354 TRUE Mar
## 8 3 Apr 181 FALSE Mar
## 9 4 Mar 1021 TRUE Mar
## 10 4 Apr 768 FALSE Mar
## # … with 392 more rows
##
## $dfConf
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 5395 5021 10416 0.482 0.0833 4.78
## 2 Feb 5094 4386 9480 0.463 0.0833 4.55
## 3 Mar 3039 7377 10416 0.708 0.0833 7.50
## 4 Apr 8862 1218 10080 0.121 0.0833 0.45
## 5 May 7343 3073 10416 0.295 0.0833 2.54
## 6 Jun 5365 4355 9720 0.448 0.0833 4.38
## 7 Jul 4445 5227 9672 0.540 0.0833 5.49
## 8 Aug 4762 4910 9672 0.508 0.0833 5.09
## 9 Sep 2744 6616 9360 0.707 0.0833 7.48
## 10 Oct 6858 2814 9672 0.291 0.0833 2.49
## 11 Nov 5822 3538 9360 0.378 0.0833 3.54
## 12 Dec 5030 4642 9672 0.480 0.0833 4.76
##
## $dfConfAll
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 10416 64759 53177 117936 0.451 0.0883 4.11 101
##
## $dfPredTest
## NULL
##
## $dfConfTest
## NULL
##
## $dfConfAllTest
## NULL
# Train-test format
# 80/20 split: sample row indices without replacement for the training set
# (seq_len() is the safe form of 1:nrow(x); identical draws under this seed)
set.seed(23072514)
idxTrain <- sample(seq_len(nrow(tmpTemp)), size=round(.8*nrow(tmpTemp)), replace=FALSE)
simpleOneVarPredict(tmpTemp[idxTrain,],
                    tgt="month",
                    prd="pct_soil_temperature_100_to_255cm",
                    showPlot=FALSE,
                    dfTest=tmpTemp[-idxTrain,]
                    )
##
## Accuracy by target subgroup (training data):
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 4333 4004 8337 0.480 0.0833 4.76
## 2 Feb 4054 3481 7535 0.462 0.0833 4.54
## 3 Mar 2234 6077 8311 0.731 0.0833 7.77
## 4 Apr 7239 810 8049 0.101 0.0833 0.208
## 5 May 5903 2464 8367 0.294 0.0833 2.53
## 6 Jun 4054 3793 7847 0.483 0.0833 4.80
## 7 Jul 3673 4083 7756 0.526 0.0833 5.32
## 8 Aug 3789 3923 7712 0.509 0.0833 5.10
## 9 Sep 2176 5300 7476 0.709 0.0833 7.51
## 10 Oct 5488 2236 7724 0.289 0.0833 2.47
## 11 Nov 4148 3330 7478 0.445 0.0833 4.34
## 12 Dec 4700 3057 7757 0.394 0.0833 3.73
##
## Overall Accuracy (training data):
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 8367 51791 42558 94349 0.451 0.0887 4.09 101
## Joining with `by = join_by(pct_soil_temperature_100_to_255cm)`
##
## Accuracy by target subgroup (testing data):
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 1068 1011 2079 0.486 0.0833 4.84
## 2 Feb 1040 905 1945 0.465 0.0833 4.58
## 3 Mar 577 1528 2105 0.726 0.0833 7.71
## 4 Apr 1858 173 2031 0.0852 0.0833 0.0222
## 5 May 1450 599 2049 0.292 0.0833 2.51
## 6 Jun 982 891 1873 0.476 0.0833 4.71
## 7 Jul 960 956 1916 0.499 0.0833 4.99
## 8 Aug 973 987 1960 0.504 0.0833 5.04
## 9 Sep 568 1316 1884 0.699 0.0833 7.38
## 10 Oct 1370 578 1948 0.297 0.0833 2.56
## 11 Nov 1028 854 1882 0.454 0.0833 4.45
## 12 Dec 1148 767 1915 0.401 0.0833 3.81
##
## Overall Accuracy (testing data):
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 2105 13022 10565 23587 0.448 0.0892 4.02 0
## $dfPred
## # A tibble: 402 × 5
## pct_soil_temperature_100_to_255cm month n correct predicted
## <dbl> <fct> <int> <lgl> <fct>
## 1 0 Apr 331 TRUE Apr
## 2 0 Mar 212 FALSE Apr
## 3 1 Mar 923 TRUE Mar
## 4 1 Apr 209 FALSE Mar
## 5 2 Mar 993 TRUE Mar
## 6 2 Apr 198 FALSE Mar
## 7 3 Mar 278 TRUE Mar
## 8 3 Apr 137 FALSE Mar
## 9 4 Mar 811 TRUE Mar
## 10 4 Apr 630 FALSE Mar
## # … with 392 more rows
##
## $dfConf
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 4333 4004 8337 0.480 0.0833 4.76
## 2 Feb 4054 3481 7535 0.462 0.0833 4.54
## 3 Mar 2234 6077 8311 0.731 0.0833 7.77
## 4 Apr 7239 810 8049 0.101 0.0833 0.208
## 5 May 5903 2464 8367 0.294 0.0833 2.53
## 6 Jun 4054 3793 7847 0.483 0.0833 4.80
## 7 Jul 3673 4083 7756 0.526 0.0833 5.32
## 8 Aug 3789 3923 7712 0.509 0.0833 5.10
## 9 Sep 2176 5300 7476 0.709 0.0833 7.51
## 10 Oct 5488 2236 7724 0.289 0.0833 2.47
## 11 Nov 4148 3330 7478 0.445 0.0833 4.34
## 12 Dec 4700 3057 7757 0.394 0.0833 3.73
##
## $dfConfAll
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 8367 51791 42558 94349 0.451 0.0887 4.09 101
##
## $dfPredTest
## # A tibble: 400 × 5
## pct_soil_temperature_100_to_255cm month predicted n correct
## <dbl> <fct> <fct> <int> <lgl>
## 1 0 Mar Apr 48 FALSE
## 2 0 Apr Apr 78 TRUE
## 3 1 Mar Mar 239 TRUE
## 4 1 Apr Mar 53 FALSE
## 5 2 Mar Mar 272 TRUE
## 6 2 Apr Mar 59 FALSE
## 7 3 Mar Mar 76 TRUE
## 8 3 Apr Mar 44 FALSE
## 9 4 Feb Mar 17 FALSE
## 10 4 Mar Mar 210 TRUE
## # … with 390 more rows
##
## $dfConfTest
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 1068 1011 2079 0.486 0.0833 4.84
## 2 Feb 1040 905 1945 0.465 0.0833 4.58
## 3 Mar 577 1528 2105 0.726 0.0833 7.71
## 4 Apr 1858 173 2031 0.0852 0.0833 0.0222
## 5 May 1450 599 2049 0.292 0.0833 2.51
## 6 Jun 982 891 1873 0.476 0.0833 4.71
## 7 Jul 960 956 1916 0.499 0.0833 4.99
## 8 Aug 973 987 1960 0.504 0.0833 5.04
## 9 Sep 568 1316 1884 0.699 0.0833 7.38
## 10 Oct 1370 578 1948 0.297 0.0833 2.56
## 11 Nov 1028 854 1882 0.454 0.0833 4.45
## 12 Dec 1148 767 1915 0.401 0.0833 3.81
##
## $dfConfAllTest
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 2105 13022 10565 23587 0.448 0.0892 4.02 0
The function is split into components for better modularity:
# Fit a single predictor to a single categorical variable
simpleOneVarFit <- function(df,
                            tgt,
                            prd,
                            rankType="last",
                            naMethod=TRUE
                            ) {
  # FUNCTION ARGUMENTS:
  # df: data frame or tibble with key elements (training data set)
  # tgt: target variable
  # prd: predictor variable
  # rankType: method for breaking ties of same n, passed to base::rank as ties.method=
  # naMethod: method for handling NA in ranks, passed to base::rank as na.last=
  # Step 1: count every (predictor, target) combination
  dfCounted <- df %>%
    group_by(across(all_of(c(prd, tgt)))) %>%
    summarize(n=n(), .groups="drop")
  # Step 2: within each predictor bucket, rank targets by count; rankN==1 is
  # the most frequent target. n()+1-rank(...) reverses rank() so that larger
  # counts receive smaller rankN values.
  dfCounted %>%
    arrange(across(all_of(prd)), desc(n), across(all_of(tgt))) %>%
    group_by(across(all_of(prd))) %>%
    mutate(rankN=n()+1-rank(n, ties.method=rankType, na.last=naMethod)) %>%
    arrange(across(all_of(prd)), rankN) %>%
    ungroup()
}
# Test that results are the same for a variable with many ties, and a variable with fewer ties
# (dropping correct/predicted and rankN leaves the shared count columns to compare)
tstFit <- simpleOneVarFit(tmpTemp, tgt="month", prd="pct_snowfall")
tstOrig <- simpleOneVarPredict(tmpTemp, tgt="month", prd="pct_snowfall", nPrint=0, showPlot=FALSE)$dfPred
all.equal(tstOrig %>% select(-correct, -predicted), tstFit %>% select(-rankN))
## [1] TRUE
tstFit <- simpleOneVarFit(tmpTemp, tgt="month", prd="pct_soil_temperature_100_to_255cm")
tstOrig <- simpleOneVarPredict(tmpTemp,
tgt="month",
prd="pct_soil_temperature_100_to_255cm",
nPrint=0,
showPlot=FALSE
)$dfPred
all.equal(tstOrig %>% select(-correct, -predicted), tstFit %>% select(-rankN))
## [1] TRUE
A prediction mapper is created, along with a default mapping for predictor values that do not appear in the training data:
simpleOneVarMapper <- function(df, tgt, prd) {
  # Build a prediction mapper from a simpleOneVarFit() result.
  # FUNCTION ARGUMENTS:
  # df: data frame or tibble from simpleOneVarFit()
  # tgt: target variable
  # prd: predictor variable
  # Prediction table: the first (top-ranked) target row per predictor value
  dfPredictor <- df %>%
    group_by(across(all_of(prd))) %>%
    slice(1) %>%
    select(all_of(c(prd, tgt))) %>%
    ungroup()
  # Overall target frequencies, most common first (basis for a default mapping)
  dfCommon <- df %>% count(across(all_of(tgt)), wt=n, sort=TRUE)
  list(dfPredictor=dfPredictor, dfCommon=dfCommon)
}
# Build the percentile -> month mapper from the fitted counts
tstMapper <- simpleOneVarMapper(tstFit, tgt="month", prd="pct_soil_temperature_100_to_255cm")
tstMapper
## $dfPredictor
## # A tibble: 101 × 2
## pct_soil_temperature_100_to_255cm month
## <dbl> <fct>
## 1 0 Apr
## 2 1 Mar
## 3 2 Mar
## 4 3 Mar
## 5 4 Mar
## 6 5 Apr
## 7 6 Mar
## 8 7 Mar
## 9 8 Mar
## 10 9 Mar
## # … with 91 more rows
##
## $dfCommon
## # A tibble: 12 × 2
## month n
## <fct> <int>
## 1 Jan 10416
## 2 Mar 10416
## 3 May 10416
## 4 Apr 10080
## 5 Jun 9720
## 6 Jul 9672
## 7 Aug 9672
## 8 Oct 9672
## 9 Dec 9672
## 10 Feb 9480
## 11 Sep 9360
## 12 Nov 9360
A function to apply the prediction mapper is created:
simpleOneVarApplyMapper <- function(df,
                                    tgt,
                                    prd,
                                    mapper,
                                    mapperDF="dfPredictor",
                                    mapperDefault="dfCommon",
                                    prdName="predicted"
                                    ) {
  # Apply a fitted one-variable mapper to df, adding a prediction column.
  # FUNCTION ARGUMENTS:
  # df: data frame containing prd for predicting tgt
  # tgt: target variable in df
  # prd: predictor variable in df
  # mapper: mapping list from simpleOneVarMapper()
  # mapperDF: element that can be used to merge mappings
  # mapperDefault: element that can be used for NA resulting from merging mapperDF
  # prdName: name for the prediction variable
  # Extract the mapper, renaming its target column to prdName
  vecRename <- c(prdName) %>% purrr::set_names(tgt)
  dfMap <- mapper[[mapperDF]] %>% select(all_of(c(prd, tgt))) %>% colRenamer(vecRename=vecRename)
  # Default prediction (first row of mapperDefault) for predictor values
  # absent from the mapper
  chrDefault <- mapper[[mapperDefault]] %>% slice(1) %>% pull(tgt)
  # Merge mappings to df. BUG FIX: build the replace_na() list from prdName so
  # a non-default prediction-column name is still backfilled (the previous
  # code hard-coded "predicted", silently ignoring prdName)
  df %>%
    left_join(dfMap, by=prd) %>%
    replace_na(purrr::set_names(list(chrDefault), prdName))
}
# Example with mutated variable
# Hour 0 is recoded to -10, a predictor value absent from training, to
# exercise the mapper's default (most common month) fallback
tmpMutated <- tmpTemp %>%
select(date, hour, month, pct_soil_temperature_100_to_255cm) %>%
mutate(pct_soil_temperature_100_to_255cm=ifelse(hour==0, -10, pct_soil_temperature_100_to_255cm))
tstApplied <- simpleOneVarApplyMapper(tmpMutated,
tgt="month",
prd="pct_soil_temperature_100_to_255cm",
mapper=tstMapper
)
tstApplied
## # A tibble: 117,936 × 5
## date hour month pct_soil_temperature_100_to_255cm predicted
## <date> <int> <fct> <dbl> <fct>
## 1 2010-01-01 0 Jan -10 Jan
## 2 2010-01-01 1 Jan 41 Jan
## 3 2010-01-01 2 Jan 41 Jan
## 4 2010-01-01 3 Jan 41 Jan
## 5 2010-01-01 4 Jan 40 Jan
## 6 2010-01-01 5 Jan 40 Jan
## 7 2010-01-01 6 Jan 40 Jan
## 8 2010-01-01 7 Jan 40 Jan
## 9 2010-01-01 8 Jan 40 Jan
## 10 2010-01-01 9 Jan 40 Jan
## # … with 117,926 more rows
# Example using tstFit to confirm same outputs
# Applying the mapper back to the fitted counts should reproduce the
# predictions from simpleOneVarPredict()
tstApplied <- simpleOneVarApplyMapper(tstFit,
tgt="month",
prd="pct_soil_temperature_100_to_255cm",
mapper=tstMapper
)
tstApplied
## # A tibble: 402 × 5
## pct_soil_temperature_100_to_255cm month n rankN predicted
## <dbl> <fct> <int> <dbl> <fct>
## 1 0 Apr 409 1 Apr
## 2 0 Mar 260 2 Apr
## 3 1 Mar 1162 1 Mar
## 4 1 Apr 262 2 Mar
## 5 2 Mar 1265 1 Mar
## 6 2 Apr 257 2 Mar
## 7 3 Mar 354 1 Mar
## 8 3 Apr 181 2 Mar
## 9 4 Mar 1021 1 Mar
## 10 4 Apr 768 2 Mar
## # … with 392 more rows
# select(-correct, correct) moves correct to the last column so the column
# order matches tstApplied with its recomputed correct flag
all.equal(tstOrig %>% select(-correct, correct),
tstApplied %>% select(-rankN) %>% mutate(correct=month==predicted)
)
## [1] TRUE
A function to create the confusion matrix data is written:
simpleOneVarConfusionData <- function(df,
                                      tgtOrig,
                                      tgtPred,
                                      otherVars=c(),
                                      weightBy="n"
                                      ) {
  # Aggregate applied-mapper output into confusion-matrix counts.
  # FUNCTION ARGUMENTS:
  # df: data frame from simpleOneVarApplyMapper()
  # tgtOrig: original target variable name in df
  # tgtPred: predicted target variable name in df
  # otherVars: other variables to be kept (will be grouping variables)
  # weightBy: weighting variable for counts in df (NULL means count each row of df as 1)
  grpCols <- c(tgtOrig, tgtPred, otherVars)
  dfGrouped <- df %>% group_by(across(all_of(grpCols)))
  # Either count rows or sum the existing weight column
  if(is.null(weightBy)) {
    dfCounts <- dfGrouped %>% summarize(n=n(), .groups="drop")
  } else {
    dfCounts <- dfGrouped %>% summarize(n=sum(.data[[weightBy]]), .groups="drop")
  }
  # Flag agreement between actual and predicted target
  dfCounts %>% mutate(correct=.data[[tgtOrig]]==.data[[tgtPred]])
}
# Example with and without weighting
# weightBy=NULL counts rows of the (already aggregated) input, so these are
# combination counts rather than observation counts
simpleOneVarConfusionData(tstApplied, tgtOrig="month", tgtPred="predicted", weightBy=NULL)
## # A tibble: 61 × 4
## month predicted n correct
## <fct> <fct> <int> <lgl>
## 1 Jan Jan 10 TRUE
## 2 Jan Feb 6 FALSE
## 3 Jan Mar 2 FALSE
## 4 Jan Apr 1 FALSE
## 5 Jan May 7 FALSE
## 6 Jan Jun 8 FALSE
## 7 Jan Dec 1 FALSE
## 8 Feb Jan 1 FALSE
## 9 Feb Feb 11 TRUE
## 10 Feb Mar 9 FALSE
## # … with 51 more rows
# Default weightBy="n" sums the n column, giving observation-level counts
simpleOneVarConfusionData(tstApplied, tgtOrig="month", tgtPred="predicted")
## # A tibble: 61 × 4
## month predicted n correct
## <fct> <fct> <int> <lgl>
## 1 Jan Jan 5021 TRUE
## 2 Jan Feb 921 FALSE
## 3 Jan Mar 107 FALSE
## 4 Jan Apr 195 FALSE
## 5 Jan May 2619 FALSE
## 6 Jan Jun 1534 FALSE
## 7 Jan Dec 19 FALSE
## 8 Feb Jan 29 FALSE
## 9 Feb Feb 4386 TRUE
## 10 Feb Mar 3145 FALSE
## # … with 51 more rows
A function to report the confusion matrix data is written:
simpleOneVarConfusionReport <- function(df,
                                        tgtOrig,
                                        tgtPred,
                                        otherVars=c(),
                                        printConf=TRUE,
                                        printConfOrig=printConf,
                                        printConfPred=printConf,
                                        printConfOverall=printConf,
                                        plotConf=TRUE,
                                        plotDesc="",
                                        nBucket=NA,
                                        predictorVarName="",
                                        returnData=FALSE
                                        ) {
  # Report accuracy/lift summaries and an overlap plot from confusion data.
  # FUNCTION ARGUMENTS:
  # df: data frame from simpleOneVarConfusionData()
  # tgtOrig: original target variable name in df
  # tgtPred: predicted target variable name in df
  # otherVars: other variables to be kept (will be grouping variables) - NOT IMPLEMENTED
  # printConf: boolean, should confusion matrix data be printed? Applies to all three
  # printConfOrig: boolean, should confusion data be printed based on original target variable?
  # printConfPred: boolean, should confusion data be printed based on predicted target variable?
  # printConfOverall: boolean, should overall confusion data be printed?
  # plotConf: boolean, should confusion overlap data be plotted?
  # plotDesc: descriptive label to be included in front of plot title
  # nBucket: number of buckets used for prediction (pass from previous data)
  # predictorVarName: variable name to be included in chart description
  # returnData: boolean, should the confusion matrices be returned?
  # Each summary is computed only when it will be printed or returned;
  # short-circuit || replaces the vectorized | for these scalar tests
  # Confusion data based on original target variable
  if(isTRUE(printConfOrig) || isTRUE(returnData)) {
    dfConfOrig <- df %>%
      group_by(across(all_of(c(tgtOrig)))) %>%
      summarize(right=sum(n*correct), wrong=sum(n)-right, n=sum(n), .groups="drop") %>%
      mutate(pctRight=right/n, pctNaive=n/(sum(n)), lift=pctRight/pctNaive-1)
  }
  # Confusion data based on predicted target variable (pctNaive/lift are not
  # computed for this view)
  if(isTRUE(printConfPred) || isTRUE(returnData)) {
    dfConfPred <- df %>%
      group_by(across(all_of(c(tgtPred)))) %>%
      summarize(right=sum(n*correct), wrong=sum(n)-right, n=sum(n), .groups="drop") %>%
      mutate(pctRight=right/n)
  }
  # Overall confusion data; naive benchmark is always guessing the modal
  # actual target (maxN/n)
  if(isTRUE(printConfOverall) || isTRUE(returnData)) {
    maxNaive <- df %>%
      group_by(across(all_of(tgtOrig))) %>%
      summarize(n=sum(n), .groups="drop") %>%
      arrange(desc(n)) %>%
      slice(1) %>%
      pull(n)
    dfConfOverall <- df %>%
      summarize(right=sum(n*correct), wrong=sum(n)-right, n=sum(n), .groups="drop") %>%
      mutate(maxN=maxNaive, pctRight=right/n, pctNaive=maxN/n, lift=pctRight/pctNaive-1, nBucket=nBucket)
  }
  # Confusion report based on original target variable
  if(isTRUE(printConfOrig)) {
    cat("\nConfusion data based on original target variable:", tgtOrig, "\n")
    dfConfOrig %>%
      print(n=50)
  }
  # Confusion report based on predicted target variable
  if(isTRUE(printConfPred)) {
    cat("\nConfusion data based on predicted target variable:", tgtPred, "\n")
    dfConfPred %>%
      print(n=50)
  }
  # Overall confusion matrix
  if(isTRUE(printConfOverall)) {
    cat("\nOverall confusion matrix\n")
    dfConfOverall %>%
      print(n=50)
  }
  # Plot of overlaps (actual vs. predicted heat map with counts)
  if(isTRUE(plotConf)) {
    p1 <- df %>%
      group_by(across(all_of(c(tgtOrig, tgtPred, "correct")))) %>%
      summarize(n=sum(n), .groups="drop") %>%
      ggplot(aes(x=get(tgtOrig), y=get(tgtPred))) +
      labs(x="Actual",
           y="Predicted",
           title=paste0(plotDesc, "Actual vs. predicted ", tgtOrig),
           subtitle=paste0("(using ", predictorVarName, ")")
           ) +
      geom_text(aes(label=n)) +
      geom_tile(aes(fill=correct), alpha=0.25)
    print(p1)
  }
  # Return data if requested
  if(isTRUE(returnData)) list(dfConfOrig=dfConfOrig, dfConfPred=dfConfPred, dfConfOverall=dfConfOverall)
}
# Example with weighting
simpleOneVarConfusionData(tstApplied, tgtOrig="month", tgtPred="predicted") %>%
simpleOneVarConfusionReport(tgtOrig="month",
tgtPred="predicted",
nBucket=length(unique(tstApplied$pct_soil_temperature_100_to_255cm)),
predictorVarName=names(tstApplied)[1]
)
##
## Confusion data based on original target variable: month
## # A tibble: 12 × 7
## month right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 5021 5395 10416 0.482 0.0883 4.46
## 2 Feb 4386 5094 9480 0.463 0.0804 4.76
## 3 Mar 7377 3039 10416 0.708 0.0883 7.02
## 4 Apr 1218 8862 10080 0.121 0.0855 0.414
## 5 May 3073 7343 10416 0.295 0.0883 2.34
## 6 Jun 4355 5365 9720 0.448 0.0824 4.44
## 7 Jul 5227 4445 9672 0.540 0.0820 5.59
## 8 Aug 4910 4762 9672 0.508 0.0820 5.19
## 9 Sep 6616 2744 9360 0.707 0.0794 7.91
## 10 Oct 2814 6858 9672 0.291 0.0820 2.55
## 11 Nov 3538 5822 9360 0.378 0.0794 3.76
## 12 Dec 4642 5030 9672 0.480 0.0820 4.85
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 12 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Jan 5021 6808 11829 0.424
## 2 Feb 4386 7504 11890 0.369
## 3 Mar 7377 7928 15305 0.482
## 4 Apr 1218 1911 3129 0.389
## 5 May 3073 5258 8331 0.369
## 6 Jun 4355 5006 9361 0.465
## 7 Jul 5227 5529 10756 0.486
## 8 Aug 4910 6218 11128 0.441
## 9 Sep 6616 5371 11987 0.552
## 10 Oct 2814 2878 5692 0.494
## 11 Nov 3538 4421 7959 0.445
## 12 Dec 4642 5927 10569 0.439
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 53177 64759 117936 10416 0.451 0.0883 4.11 101
# Example with weighting and data returned without reporting
simpleOneVarConfusionData(tstApplied, tgtOrig="month", tgtPred="predicted") %>%
simpleOneVarConfusionReport(tgtOrig="month",
tgtPred="predicted",
nBucket=length(unique(tstApplied$pct_soil_temperature_100_to_255cm)),
predictorVarName=names(tstApplied)[1],
printConf=FALSE,
plotConf=FALSE,
returnData=TRUE
)
## $dfConfOrig
## # A tibble: 12 × 7
## month right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 5021 5395 10416 0.482 0.0883 4.46
## 2 Feb 4386 5094 9480 0.463 0.0804 4.76
## 3 Mar 7377 3039 10416 0.708 0.0883 7.02
## 4 Apr 1218 8862 10080 0.121 0.0855 0.414
## 5 May 3073 7343 10416 0.295 0.0883 2.34
## 6 Jun 4355 5365 9720 0.448 0.0824 4.44
## 7 Jul 5227 4445 9672 0.540 0.0820 5.59
## 8 Aug 4910 4762 9672 0.508 0.0820 5.19
## 9 Sep 6616 2744 9360 0.707 0.0794 7.91
## 10 Oct 2814 6858 9672 0.291 0.0820 2.55
## 11 Nov 3538 5822 9360 0.378 0.0794 3.76
## 12 Dec 4642 5030 9672 0.480 0.0820 4.85
##
## $dfConfPred
## # A tibble: 12 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Jan 5021 6808 11829 0.424
## 2 Feb 4386 7504 11890 0.369
## 3 Mar 7377 7928 15305 0.482
## 4 Apr 1218 1911 3129 0.389
## 5 May 3073 5258 8331 0.369
## 6 Jun 4355 5006 9361 0.465
## 7 Jul 5227 5529 10756 0.486
## 8 Aug 4910 6218 11128 0.441
## 9 Sep 6616 5371 11987 0.552
## 10 Oct 2814 2878 5692 0.494
## 11 Nov 3538 4421 7959 0.445
## 12 Dec 4642 5927 10569 0.439
##
## $dfConfOverall
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 53177 64759 117936 10416 0.451 0.0883 4.11 101
The functions are run together:
# Create fitting data
dfFit <- simpleOneVarFit(tmpTemp, tgt="month", prd="pct_temperature_2m")
# Apply and report on fitting data
dfFit %>%
simpleOneVarMapper(tgt="month", prd="pct_temperature_2m") %>%
simpleOneVarApplyMapper(df=dfFit, tgt="month", prd="pct_temperature_2m", mapper=.) %>%
simpleOneVarConfusionData(tgtOrig="month", tgtPred="predicted") %>%
simpleOneVarConfusionReport(tgtOrig="month",
tgtPred="predicted",
nBucket=length(unique(dfFit$pct_temperature_2m))
)
##
## Confusion data based on original target variable: month
## # A tibble: 12 × 7
## month right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 6599 3817 10416 0.634 0.0883 6.17
## 2 Feb 849 8631 9480 0.0896 0.0804 0.114
## 3 Mar 2052 8364 10416 0.197 0.0883 1.23
## 4 Apr 3414 6666 10080 0.339 0.0855 2.96
## 5 May 1534 8882 10416 0.147 0.0883 0.668
## 6 Jun 1227 8493 9720 0.126 0.0824 0.532
## 7 Jul 3279 6393 9672 0.339 0.0820 3.13
## 8 Aug 5016 4656 9672 0.519 0.0820 5.32
## 9 Sep 2601 6759 9360 0.278 0.0794 2.50
## 10 Oct 2594 7078 9672 0.268 0.0820 2.27
## 11 Nov 0 9360 9360 0 0.0794 -1
## 12 Dec 1774 7898 9672 0.183 0.0820 1.24
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 11 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Jan 6599 14465 21064 0.313
## 2 Feb 849 1608 2457 0.346
## 3 Mar 2052 7459 9511 0.216
## 4 Apr 3414 10804 14218 0.240
## 5 May 1534 5517 7051 0.218
## 6 Jun 1227 4759 5986 0.205
## 7 Jul 3279 5675 8954 0.366
## 8 Aug 5016 13331 18347 0.273
## 9 Sep 2601 9486 12087 0.215
## 10 Oct 2594 8115 10709 0.242
## 11 Dec 1774 5778 7552 0.235
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 30939 86997 117936 10416 0.262 0.0883 1.97 101
A function is written to chain all of the functions:
simpleOneVarChain <- function(df,
                              tgt,
                              prd,
                              mapper=NULL,
                              rankType="last",
                              naMethod=TRUE,
                              printReport=TRUE,
                              plotDesc="",
                              returnData=TRUE,
                              includeConfData=FALSE
                              ) {
  # Chain all of the simpleOneVar* steps (fit, map, apply, confusion, report)
  # for a single predictor/target pair in one call.
  # FUNCTION ARGUMENTS:
  # df: data frame or tibble with key elements (training or testing data set)
  # tgt: target variable (character name of a column in df)
  # prd: predictor variable (character name of a column in df)
  # mapper: mapping file to be applied for predictions (NULL means create one
  #         from the fitted data; supply a training mapper to score test data)
  # rankType: method for breaking ties of same n, passed to base::rank as ties.method=
  # naMethod: method for handling NA in ranks, passed to base::rank as na.last=
  # printReport: boolean, should the confusion report data and plot be printed?
  # plotDesc: descriptive label to be included in front of plot title
  # returnData: boolean, should data elements be returned?
  # includeConfData: boolean, should confusion data be returned?
  # RETURNS (when returnData=TRUE): list with dfFit, mapper, dfApplied,
  # dfConfusion, and (when includeConfData=TRUE) dfConfData; otherwise NULL
  # Create the summary of predictor-target-n
  dfFit <- simpleOneVarFit(df, tgt=tgt, prd=prd, rankType=rankType, naMethod=naMethod)
  # Create the mapper if it does not already exist
  if(is.null(mapper)) mapper <- simpleOneVarMapper(dfFit, tgt=tgt, prd=prd)
  # Apply mapper to data
  dfApplied <- simpleOneVarApplyMapper(dfFit, tgt=tgt, prd=prd, mapper=mapper)
  # Create confusion data
  dfConfusion <- simpleOneVarConfusionData(dfApplied, tgtOrig=tgt, tgtPred="predicted")
  # Create confusion report if requested; note `||` (scalar, short-circuit)
  # rather than elementwise `|` since both operands are guaranteed scalars
  if(isTRUE(printReport) || isTRUE(includeConfData)) {
    dfConfReport <- simpleOneVarConfusionReport(df=dfConfusion,
                                                tgtOrig=tgt,
                                                tgtPred="predicted",
                                                nBucket=length(unique(dfApplied[[prd]])),
                                                predictorVarName=prd,
                                                printConf=printReport,
                                                plotConf=printReport,
                                                plotDesc=plotDesc,
                                                returnData=includeConfData
                                                )
  }
  # Return data if requested (otherwise the if() yields NULL invisibly)
  if(isTRUE(returnData)) {
    ret <- list(dfFit=dfFit, mapper=mapper, dfApplied=dfApplied, dfConfusion=dfConfusion)
    if(isTRUE(includeConfData)) ret <- c(ret, list(dfConfData=dfConfReport))
    ret
  }
}
# Full process
tmpChain <- simpleOneVarChain(tmpTemp, tgt="month", prd="pct_temperature_2m")
##
## Confusion data based on original target variable: month
## # A tibble: 12 × 7
## month right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 6599 3817 10416 0.634 0.0883 6.17
## 2 Feb 849 8631 9480 0.0896 0.0804 0.114
## 3 Mar 2052 8364 10416 0.197 0.0883 1.23
## 4 Apr 3414 6666 10080 0.339 0.0855 2.96
## 5 May 1534 8882 10416 0.147 0.0883 0.668
## 6 Jun 1227 8493 9720 0.126 0.0824 0.532
## 7 Jul 3279 6393 9672 0.339 0.0820 3.13
## 8 Aug 5016 4656 9672 0.519 0.0820 5.32
## 9 Sep 2601 6759 9360 0.278 0.0794 2.50
## 10 Oct 2594 7078 9672 0.268 0.0820 2.27
## 11 Nov 0 9360 9360 0 0.0794 -1
## 12 Dec 1774 7898 9672 0.183 0.0820 1.24
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 11 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Jan 6599 14465 21064 0.313
## 2 Feb 849 1608 2457 0.346
## 3 Mar 2052 7459 9511 0.216
## 4 Apr 3414 10804 14218 0.240
## 5 May 1534 5517 7051 0.218
## 6 Jun 1227 4759 5986 0.205
## 7 Jul 3279 5675 8954 0.366
## 8 Aug 5016 13331 18347 0.273
## 9 Sep 2601 9486 12087 0.215
## 10 Oct 2594 8115 10709 0.242
## 11 Dec 1774 5778 7552 0.235
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 30939 86997 117936 10416 0.262 0.0883 1.97 101
str(tmpChain)
## List of 4
## $ dfFit : tibble [840 × 4] (S3: tbl_df/tbl/data.frame)
## ..$ pct_temperature_2m: num [1:840] 0 0 0 0 1 1 1 1 2 2 ...
## ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 12 3 1 2 12 3 1 2 ...
## ..$ n : int [1:840] 309 204 66 16 515 480 184 43 484 415 ...
## ..$ rankN : num [1:840] 1 2 3 4 1 2 3 4 1 2 ...
## $ mapper :List of 2
## ..$ dfPredictor: tibble [101 × 2] (S3: tbl_df/tbl/data.frame)
## .. ..$ pct_temperature_2m: num [1:101] 0 1 2 3 4 5 6 7 8 9 ...
## .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 2 1 1 2 1 ...
## ..$ dfCommon : tibble [12 × 2] (S3: tbl_df/tbl/data.frame)
## .. ..$ month: Factor w/ 12 levels "Jan","Feb","Mar",..: 1 3 5 4 6 7 8 10 12 2 ...
## .. ..$ n : int [1:12] 10416 10416 10416 10080 9720 9672 9672 9672 9672 9480 ...
## $ dfApplied : tibble [840 × 5] (S3: tbl_df/tbl/data.frame)
## ..$ pct_temperature_2m: num [1:840] 0 0 0 0 1 1 1 1 2 2 ...
## ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 12 3 1 2 12 3 1 2 ...
## ..$ n : int [1:840] 309 204 66 16 515 480 184 43 484 415 ...
## ..$ rankN : num [1:840] 1 2 3 4 1 2 3 4 1 2 ...
## ..$ predicted : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ dfConfusion: tibble [102 × 4] (S3: tbl_df/tbl/data.frame)
## ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 2 2 2 ...
## ..$ predicted: Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 10 12 1 2 3 ...
## ..$ n : int [1:102] 6599 797 952 608 18 115 1327 5479 849 999 ...
## ..$ correct : logi [1:102] TRUE FALSE FALSE FALSE FALSE FALSE ...
# Plots only
simpleOneVarChain(tmpTemp, tgt="month", prd="pct_temperature_2m", returnData=FALSE)
##
## Confusion data based on original target variable: month
## # A tibble: 12 × 7
## month right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 6599 3817 10416 0.634 0.0883 6.17
## 2 Feb 849 8631 9480 0.0896 0.0804 0.114
## 3 Mar 2052 8364 10416 0.197 0.0883 1.23
## 4 Apr 3414 6666 10080 0.339 0.0855 2.96
## 5 May 1534 8882 10416 0.147 0.0883 0.668
## 6 Jun 1227 8493 9720 0.126 0.0824 0.532
## 7 Jul 3279 6393 9672 0.339 0.0820 3.13
## 8 Aug 5016 4656 9672 0.519 0.0820 5.32
## 9 Sep 2601 6759 9360 0.278 0.0794 2.50
## 10 Oct 2594 7078 9672 0.268 0.0820 2.27
## 11 Nov 0 9360 9360 0 0.0794 -1
## 12 Dec 1774 7898 9672 0.183 0.0820 1.24
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 11 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Jan 6599 14465 21064 0.313
## 2 Feb 849 1608 2457 0.346
## 3 Mar 2052 7459 9511 0.216
## 4 Apr 3414 10804 14218 0.240
## 5 May 1534 5517 7051 0.218
## 6 Jun 1227 4759 5986 0.205
## 7 Jul 3279 5675 8954 0.366
## 8 Aug 5016 13331 18347 0.273
## 9 Sep 2601 9486 12087 0.215
## 10 Oct 2594 8115 10709 0.242
## 11 Dec 1774 5778 7552 0.235
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 30939 86997 117936 10416 0.262 0.0883 1.97 101
# Data only
tmpChain_v2 <- simpleOneVarChain(tmpTemp, tgt="month", prd="pct_temperature_2m", printReport=FALSE)
identical(tmpChain_v2, tmpChain)
## [1] TRUE
# Data only using a mapper
tmpChain_v3 <- simpleOneVarChain(tmpTemp,
tgt="month",
prd="pct_temperature_2m",
mapper=tmpChain$mapper,
printReport=FALSE
)
identical(tmpChain_v3, tmpChain)
## [1] TRUE
# Return confusion data
tmpChain_v4 <- simpleOneVarChain(tmpTemp,
tgt="month",
prd="pct_temperature_2m",
mapper=tmpChain$mapper,
printReport=FALSE,
includeConfData=TRUE
)
identical(tmpChain_v4[1:4], tmpChain)
## [1] TRUE
tmpChain_v4$dfConfData
## $dfConfOrig
## # A tibble: 12 × 7
## month right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 6599 3817 10416 0.634 0.0883 6.17
## 2 Feb 849 8631 9480 0.0896 0.0804 0.114
## 3 Mar 2052 8364 10416 0.197 0.0883 1.23
## 4 Apr 3414 6666 10080 0.339 0.0855 2.96
## 5 May 1534 8882 10416 0.147 0.0883 0.668
## 6 Jun 1227 8493 9720 0.126 0.0824 0.532
## 7 Jul 3279 6393 9672 0.339 0.0820 3.13
## 8 Aug 5016 4656 9672 0.519 0.0820 5.32
## 9 Sep 2601 6759 9360 0.278 0.0794 2.50
## 10 Oct 2594 7078 9672 0.268 0.0820 2.27
## 11 Nov 0 9360 9360 0 0.0794 -1
## 12 Dec 1774 7898 9672 0.183 0.0820 1.24
##
## $dfConfPred
## # A tibble: 11 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Jan 6599 14465 21064 0.313
## 2 Feb 849 1608 2457 0.346
## 3 Mar 2052 7459 9511 0.216
## 4 Apr 3414 10804 14218 0.240
## 5 May 1534 5517 7051 0.218
## 6 Jun 1227 4759 5986 0.205
## 7 Jul 3279 5675 8954 0.366
## 8 Aug 5016 13331 18347 0.273
## 9 Sep 2601 9486 12087 0.215
## 10 Oct 2594 8115 10709 0.242
## 11 Dec 1774 5778 7552 0.235
##
## $dfConfOverall
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 30939 86997 117936 10416 0.262 0.0883 1.97 101
The process is then run on a train-test basis for a single variable:
# Add random variables to the dataset, then split into test and train.
# The pct_XXXX columns are uniform random draws over 0:XXXX; they act as
# pure-noise baselines of increasing cardinality for the lift comparison below.
set.seed(23080412)
tmpTempRand <- tmpTemp %>%
mutate(pct_0005=sample(0:5, size=nrow(.), replace=TRUE),
pct_0025=sample(0:25, size=nrow(.), replace=TRUE),
pct_0100=sample(0:100, size=nrow(.), replace=TRUE),
pct_0250=sample(0:250, size=nrow(.), replace=TRUE),
pct_0500=sample(0:500, size=nrow(.), replace=TRUE),
pct_1000=sample(0:1000, size=nrow(.), replace=TRUE),
pct_2500=sample(0:2500, size=nrow(.), replace=TRUE),
pct_5000=sample(0:5000, size=nrow(.), replace=TRUE),
)
# 75/25 train/test split by row index; sorting keeps rows in original order
idxTrain <- sort(sample(1:nrow(tmpTempRand), size=round(0.75*nrow(tmpTempRand)), replace=FALSE))
tmpTempTrain <- tmpTempRand[idxTrain, ]
tmpTempTest <- tmpTempRand[-idxTrain, ]
# Full process run on training data
tmpChainTrain <- simpleOneVarChain(tmpTempTrain,
tgt="month",
prd="pct_temperature_2m",
includeConfData=TRUE,
plotDesc="Training data: "
)
##
## Confusion data based on original target variable: month
## # A tibble: 12 × 7
## month right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 5266 2515 7781 0.677 0.0880 6.69
## 2 Feb 323 6788 7111 0.0454 0.0804 -0.435
## 3 Mar 1744 6112 7856 0.222 0.0888 1.50
## 4 Apr 2618 4887 7505 0.349 0.0848 3.11
## 5 May 1309 6569 7878 0.166 0.0891 0.866
## 6 Jun 884 6392 7276 0.121 0.0823 0.477
## 7 Jul 2374 4904 7278 0.326 0.0823 2.96
## 8 Aug 3840 3384 7224 0.532 0.0817 5.51
## 9 Sep 1993 5087 7080 0.281 0.0800 2.52
## 10 Oct 1763 5503 7266 0.243 0.0821 1.95
## 11 Nov 0 6943 6943 0 0.0785 -1
## 12 Dec 1133 6121 7254 0.156 0.0820 0.905
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 11 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Jan 5266 11424 16690 0.316
## 2 Feb 323 616 939 0.344
## 3 Mar 1744 6115 7859 0.222
## 4 Apr 2618 8350 10968 0.239
## 5 May 1309 4702 6011 0.218
## 6 Jun 884 3416 4300 0.206
## 7 Jul 2374 4197 6571 0.361
## 8 Aug 3840 10116 13956 0.275
## 9 Sep 1993 7240 9233 0.216
## 10 Oct 1763 5274 7037 0.251
## 11 Dec 1133 3755 4888 0.232
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 23247 65205 88452 7878 0.263 0.0891 1.95 101
# Diagnostics run on testing data
tmpChainTest <- simpleOneVarChain(tmpTempTest,
tgt="month",
prd="pct_temperature_2m",
mapper=tmpChainTrain$mapper,
includeConfData=TRUE,
plotDesc="Testing data: "
)
##
## Confusion data based on original target variable: month
## # A tibble: 12 × 7
## month right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 1766 869 2635 0.670 0.0894 6.50
## 2 Feb 91 2278 2369 0.0384 0.0803 -0.522
## 3 Mar 533 2027 2560 0.208 0.0868 1.40
## 4 Apr 860 1715 2575 0.334 0.0873 2.82
## 5 May 401 2137 2538 0.158 0.0861 0.835
## 6 Jun 287 2157 2444 0.117 0.0829 0.417
## 7 Jul 797 1597 2394 0.333 0.0812 3.10
## 8 Aug 1259 1189 2448 0.514 0.0830 5.19
## 9 Sep 648 1632 2280 0.284 0.0773 2.68
## 10 Oct 533 1873 2406 0.222 0.0816 1.71
## 11 Nov 0 2417 2417 0 0.0820 -1
## 12 Dec 405 2013 2418 0.167 0.0820 1.04
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 11 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Jan 1766 3816 5582 0.316
## 2 Feb 91 219 310 0.294
## 3 Mar 533 2141 2674 0.199
## 4 Apr 860 2783 3643 0.236
## 5 May 401 1537 1938 0.207
## 6 Jun 287 1136 1423 0.202
## 7 Jul 797 1396 2193 0.363
## 8 Aug 1259 3322 4581 0.275
## 9 Sep 648 2469 3117 0.208
## 10 Oct 533 1848 2381 0.224
## 11 Dec 405 1237 1642 0.247
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 7580 21904 29484 2635 0.257 0.0894 1.88 101
# Overall confusion
tmpChainTrain$dfConfData$dfConfOverall
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 23247 65205 88452 7878 0.263 0.0891 1.95 101
tmpChainTest$dfConfData$dfConfOverall
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 7580 21904 29484 2635 0.257 0.0894 1.88 101
The process for running train-test is created as a function:
simpleOneVarTrainTest <- function(dfTrain,
                                  dfTest,
                                  tgt,
                                  prd,
                                  rankType="last",
                                  naMethod=TRUE,
                                  printReport=FALSE,
                                  includeConfData=TRUE,
                                  returnData=TRUE
                                  ) {
  # Run the one-variable chain on a train/test pair: fit a mapper on the
  # training data, then score the testing data with that same mapper.
  # FUNCTION ARGUMENTS:
  # dfTrain: data frame or tibble with key elements (training data set)
  # dfTest: data frame or tibble with key elements (testing data set)
  # tgt: target variable
  # prd: predictor variable
  # rankType: method for breaking ties of same n, passed to base::rank as ties.method=
  # naMethod: method for handling NA in ranks, passed to base::rank as na.last=
  # printReport: boolean, should the confusion report data and plot be printed?
  # includeConfData: boolean, should confusion data be returned?
  # returnData: boolean, should data elements be returned?
  # Arguments common to both chain calls, assembled once
  sharedArgs <- list(tgt=tgt,
                     prd=prd,
                     rankType=rankType,
                     naMethod=naMethod,
                     printReport=printReport,
                     returnData=TRUE,
                     includeConfData=includeConfData
                     )
  # Fit the training data (mapper is created internally from the training fit)
  resTrain <- do.call(simpleOneVarChain,
                      c(list(df=dfTrain, plotDesc="Training data: "), sharedArgs)
                      )
  # Score the testing data, reusing the mapper fitted on the training data
  resTest <- do.call(simpleOneVarChain,
                     c(list(df=dfTest, mapper=resTrain$mapper, plotDesc="Testing data: "), sharedArgs)
                     )
  # Return both result sets if requested (element names kept for callers)
  if(isTRUE(returnData)) list(tmpTrain=resTrain, tmpTest=resTest)
}
# Full process without plotting
tmpVTT <- simpleOneVarTrainTest(dfTrain=tmpTempTrain,
dfTest=tmpTempTest,
tgt="month",
prd="pct_temperature_2m"
)
str(tmpVTT)
## List of 2
## $ tmpTrain:List of 5
## ..$ dfFit : tibble [836 × 4] (S3: tbl_df/tbl/data.frame)
## .. ..$ pct_temperature_2m: num [1:836] 0 0 0 0 1 1 1 1 2 2 ...
## .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 12 3 1 2 12 3 1 2 ...
## .. ..$ n : int [1:836] 226 152 48 11 391 364 145 36 358 302 ...
## .. ..$ rankN : num [1:836] 1 2 3 4 1 2 3 4 1 2 ...
## ..$ mapper :List of 2
## .. ..$ dfPredictor: tibble [101 × 2] (S3: tbl_df/tbl/data.frame)
## .. .. ..$ pct_temperature_2m: num [1:101] 0 1 2 3 4 5 6 7 8 9 ...
## .. .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 2 1 ...
## .. ..$ dfCommon : tibble [12 × 2] (S3: tbl_df/tbl/data.frame)
## .. .. ..$ month: Factor w/ 12 levels "Jan","Feb","Mar",..: 5 3 1 4 7 6 10 12 8 2 ...
## .. .. ..$ n : int [1:12] 7878 7856 7781 7505 7278 7276 7266 7254 7224 7111 ...
## ..$ dfApplied : tibble [836 × 5] (S3: tbl_df/tbl/data.frame)
## .. ..$ pct_temperature_2m: num [1:836] 0 0 0 0 1 1 1 1 2 2 ...
## .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 12 3 1 2 12 3 1 2 ...
## .. ..$ n : int [1:836] 226 152 48 11 391 364 145 36 358 302 ...
## .. ..$ rankN : num [1:836] 1 2 3 4 1 2 3 4 1 2 ...
## .. ..$ predicted : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..$ dfConfusion: tibble [103 × 4] (S3: tbl_df/tbl/data.frame)
## .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 2 2 2 ...
## .. ..$ predicted: Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 10 12 1 2 3 ...
## .. ..$ n : int [1:103] 5266 268 802 460 29 60 896 4430 323 818 ...
## .. ..$ correct : logi [1:103] TRUE FALSE FALSE FALSE FALSE FALSE ...
## ..$ dfConfData :List of 3
## .. ..$ dfConfOrig : tibble [12 × 7] (S3: tbl_df/tbl/data.frame)
## .. .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 6 7 8 9 10 ...
## .. .. ..$ right : int [1:12] 5266 323 1744 2618 1309 884 2374 3840 1993 1763 ...
## .. .. ..$ wrong : int [1:12] 2515 6788 6112 4887 6569 6392 4904 3384 5087 5503 ...
## .. .. ..$ n : int [1:12] 7781 7111 7856 7505 7878 7276 7278 7224 7080 7266 ...
## .. .. ..$ pctRight: num [1:12] 0.6768 0.0454 0.222 0.3488 0.1662 ...
## .. .. ..$ pctNaive: num [1:12] 0.088 0.0804 0.0888 0.0848 0.0891 ...
## .. .. ..$ lift : num [1:12] 6.693 -0.435 1.499 3.111 0.866 ...
## .. ..$ dfConfPred : tibble [11 × 5] (S3: tbl_df/tbl/data.frame)
## .. .. ..$ predicted: Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 6 7 8 9 10 ...
## .. .. ..$ right : int [1:11] 5266 323 1744 2618 1309 884 2374 3840 1993 1763 ...
## .. .. ..$ wrong : int [1:11] 11424 616 6115 8350 4702 3416 4197 10116 7240 5274 ...
## .. .. ..$ n : int [1:11] 16690 939 7859 10968 6011 4300 6571 13956 9233 7037 ...
## .. .. ..$ pctRight : num [1:11] 0.316 0.344 0.222 0.239 0.218 ...
## .. ..$ dfConfOverall: tibble [1 × 8] (S3: tbl_df/tbl/data.frame)
## .. .. ..$ right : int 23247
## .. .. ..$ wrong : int 65205
## .. .. ..$ n : int 88452
## .. .. ..$ maxN : int 7878
## .. .. ..$ pctRight: num 0.263
## .. .. ..$ pctNaive: num 0.0891
## .. .. ..$ lift : num 1.95
## .. .. ..$ nBucket : int 101
## $ tmpTest :List of 5
## ..$ dfFit : tibble [792 × 4] (S3: tbl_df/tbl/data.frame)
## .. ..$ pct_temperature_2m: num [1:792] 0 0 0 0 1 1 1 1 2 2 ...
## .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 12 3 1 2 12 3 1 2 ...
## .. ..$ n : int [1:792] 83 52 18 5 124 116 39 7 126 113 ...
## .. ..$ rankN : num [1:792] 1 2 3 4 1 2 3 4 1 2 ...
## ..$ mapper :List of 2
## .. ..$ dfPredictor: tibble [101 × 2] (S3: tbl_df/tbl/data.frame)
## .. .. ..$ pct_temperature_2m: num [1:101] 0 1 2 3 4 5 6 7 8 9 ...
## .. .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 2 1 ...
## .. ..$ dfCommon : tibble [12 × 2] (S3: tbl_df/tbl/data.frame)
## .. .. ..$ month: Factor w/ 12 levels "Jan","Feb","Mar",..: 5 3 1 4 7 6 10 12 8 2 ...
## .. .. ..$ n : int [1:12] 7878 7856 7781 7505 7278 7276 7266 7254 7224 7111 ...
## ..$ dfApplied : tibble [792 × 5] (S3: tbl_df/tbl/data.frame)
## .. ..$ pct_temperature_2m: num [1:792] 0 0 0 0 1 1 1 1 2 2 ...
## .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 12 3 1 2 12 3 1 2 ...
## .. ..$ n : int [1:792] 83 52 18 5 124 116 39 7 126 113 ...
## .. ..$ rankN : num [1:792] 1 2 3 4 1 2 3 4 1 2 ...
## .. ..$ predicted : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..$ dfConfusion: tibble [103 × 4] (S3: tbl_df/tbl/data.frame)
## .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 2 2 2 ...
## .. ..$ predicted: Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 10 12 1 2 3 ...
## .. ..$ n : int [1:103] 1766 96 276 154 10 28 305 1484 91 301 ...
## .. ..$ correct : logi [1:103] TRUE FALSE FALSE FALSE FALSE FALSE ...
## ..$ dfConfData :List of 3
## .. ..$ dfConfOrig : tibble [12 × 7] (S3: tbl_df/tbl/data.frame)
## .. .. ..$ month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 6 7 8 9 10 ...
## .. .. ..$ right : int [1:12] 1766 91 533 860 401 287 797 1259 648 533 ...
## .. .. ..$ wrong : int [1:12] 869 2278 2027 1715 2137 2157 1597 1189 1632 1873 ...
## .. .. ..$ n : int [1:12] 2635 2369 2560 2575 2538 2444 2394 2448 2280 2406 ...
## .. .. ..$ pctRight: num [1:12] 0.6702 0.0384 0.2082 0.334 0.158 ...
## .. .. ..$ pctNaive: num [1:12] 0.0894 0.0803 0.0868 0.0873 0.0861 ...
## .. .. ..$ lift : num [1:12] 6.499 -0.522 1.398 2.824 0.835 ...
## .. ..$ dfConfPred : tibble [11 × 5] (S3: tbl_df/tbl/data.frame)
## .. .. ..$ predicted: Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 6 7 8 9 10 ...
## .. .. ..$ right : int [1:11] 1766 91 533 860 401 287 797 1259 648 533 ...
## .. .. ..$ wrong : int [1:11] 3816 219 2141 2783 1537 1136 1396 3322 2469 1848 ...
## .. .. ..$ n : int [1:11] 5582 310 2674 3643 1938 1423 2193 4581 3117 2381 ...
## .. .. ..$ pctRight : num [1:11] 0.316 0.294 0.199 0.236 0.207 ...
## .. ..$ dfConfOverall: tibble [1 × 8] (S3: tbl_df/tbl/data.frame)
## .. .. ..$ right : int 7580
## .. .. ..$ wrong : int 21904
## .. .. ..$ n : int 29484
## .. .. ..$ maxN : int 2635
## .. .. ..$ pctRight: num 0.257
## .. .. ..$ pctNaive: num 0.0894
## .. .. ..$ lift : num 1.88
## .. .. ..$ nBucket : int 101
# Extracting key elements of prediction accuracy
map_dfr(.x=tmpVTT, .f=function(x) x$dfConfData$dfConfOverall) %>%
mutate(dataType=names(tmpVTT),
tgt=names(tmpVTT[[1]]$mapper$dfPredictor)[2],
prd=names(tmpVTT[[1]]$mapper$dfPredictor)[1]
) %>%
select(dataType, tgt, prd, everything())
## # A tibble: 2 × 11
## dataType tgt prd right wrong n maxN pctRi…¹ pctNa…² lift nBucket
## <chr> <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 tmpTrain month pct_temp… 23247 65205 88452 7878 0.263 0.0891 1.95 101
## 2 tmpTest month pct_temp… 7580 21904 29484 2635 0.257 0.0894 1.88 101
## # … with abbreviated variable names ¹pctRight, ²pctNaive
Predictive power for each variable on month is explored:
# Get all pct variables
pctVars <- tmpTempTrain %>%
select(starts_with("pct")) %>%
names()
pctVars
## [1] "pct_hour" "pct_temperature_2m"
## [3] "pct_relativehumidity_2m" "pct_dewpoint_2m"
## [5] "pct_apparent_temperature" "pct_pressure_msl"
## [7] "pct_surface_pressure" "pct_precipitation"
## [9] "pct_rain" "pct_snowfall"
## [11] "pct_cloudcover" "pct_cloudcover_low"
## [13] "pct_cloudcover_mid" "pct_cloudcover_high"
## [15] "pct_shortwave_radiation" "pct_direct_radiation"
## [17] "pct_direct_normal_irradiance" "pct_diffuse_radiation"
## [19] "pct_windspeed_10m" "pct_windspeed_100m"
## [21] "pct_winddirection_10m" "pct_winddirection_100m"
## [23] "pct_windgusts_10m" "pct_et0_fao_evapotranspiration"
## [25] "pct_weathercode" "pct_vapor_pressure_deficit"
## [27] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [29] "pct_soil_temperature_28_to_100cm" "pct_soil_temperature_100_to_255cm"
## [31] "pct_soil_moisture_0_to_7cm" "pct_soil_moisture_7_to_28cm"
## [33] "pct_soil_moisture_28_to_100cm" "pct_soil_moisture_100_to_255cm"
## [35] "pct_0005" "pct_0025"
## [37] "pct_0100" "pct_0250"
## [39] "pct_0500" "pct_1000"
## [41] "pct_2500" "pct_5000"
# Run each variable and combine as dfr
tmpLiftPct <- map_dfr(.x=pctVars,
.f=function(x) {
tmp <- simpleOneVarTrainTest(dfTrain=tmpTempTrain, dfTest=tmpTempTest, tgt="month", prd=x)
map_dfr(.x=tmp, .f=function(y) y$dfConfData$dfConfOverall) %>%
mutate(dataType=names(tmp),
tgt=names(tmp[[1]]$mapper$dfPredictor)[2],
prd=names(tmp[[1]]$mapper$dfPredictor)[1]
) %>%
select(dataType, tgt, prd, everything())
}
)
tmpLiftPct
## # A tibble: 84 × 11
## dataType tgt prd right wrong n maxN pctRi…¹ pctNa…² lift nBucket
## <chr> <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 tmpTrain month pct_h… 8016 80436 88452 7878 0.0906 0.0891 0.0175 24
## 2 tmpTest month pct_h… 2386 27098 29484 2635 0.0809 0.0894 -0.0945 24
## 3 tmpTrain month pct_t… 23247 65205 88452 7878 0.263 0.0891 1.95 101
## 4 tmpTest month pct_t… 7580 21904 29484 2635 0.257 0.0894 1.88 101
## 5 tmpTrain month pct_r… 9721 78731 88452 7878 0.110 0.0891 0.234 60
## 6 tmpTest month pct_r… 3066 26418 29484 2635 0.104 0.0894 0.164 60
## 7 tmpTrain month pct_d… 21591 66861 88452 7878 0.244 0.0891 1.74 101
## 8 tmpTest month pct_d… 6942 22542 29484 2635 0.235 0.0894 1.63 101
## 9 tmpTrain month pct_a… 23250 65202 88452 7878 0.263 0.0891 1.95 101
## 10 tmpTest month pct_a… 7488 21996 29484 2635 0.254 0.0894 1.84 101
## # … with 74 more rows, and abbreviated variable names ¹pctRight, ²pctNaive
Variables are plotted based on explanatory power on month:
# Dot plot of lift per predictor, faceted by train/test data set.
tmpLiftPct %>%
# Order predictors by their worst (minimum) lift across data sets
ggplot(aes(x=fct_reorder(prd, lift, min), y=lift)) +
geom_point() +
coord_flip() +
# Named-vector lookup recodes dataType into ordered, human-readable facet labels
facet_wrap(~c("tmpTest"="1. Test data", "tmpTrain"="2. Training data")[dataType]) +
geom_hline(yintercept=0, lty=2) + # Zero lift = no better than naive guess
labs(y="Lift (percent correct divided by percent of most frequent month, minus 1)",
x=NULL,
title="Explanatory power of variable on month"
)
Predictive power for each variable on hour (as factor) is explored:
# Run each variable and combine as dfr (pctVars derived previously for month)
tmpLiftHourPct <- map_dfr(.x=pctVars,
.f=function(x) {
tmp <- simpleOneVarTrainTest(dfTrain=tmpTempTrain,
dfTest=tmpTempTest,
tgt="fct_hour",
prd=x
)
map_dfr(.x=tmp, .f=function(y) y$dfConfData$dfConfOverall) %>%
mutate(dataType=names(tmp),
tgt=names(tmp[[1]]$mapper$dfPredictor)[2],
prd=names(tmp[[1]]$mapper$dfPredictor)[1]
) %>%
select(dataType, tgt, prd, everything())
}
)
tmpLiftHourPct
## # A tibble: 84 × 11
## dataType tgt prd right wrong n maxN pctRi…¹ pctNa…² lift nBucket
## <chr> <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 tmpTrain fct_h… pct_… 88452 0 88452 3738 1 0.0423 22.7 24
## 2 tmpTest fct_h… pct_… 29484 0 29484 1276 1 0.0433 22.1 24
## 3 tmpTrain fct_h… pct_… 5499 82953 88452 3738 0.0622 0.0423 0.471 101
## 4 tmpTest fct_h… pct_… 1407 28077 29484 1276 0.0477 0.0433 0.103 101
## 5 tmpTrain fct_h… pct_… 6430 82022 88452 3738 0.0727 0.0423 0.720 60
## 6 tmpTest fct_h… pct_… 1931 27553 29484 1276 0.0655 0.0433 0.513 60
## 7 tmpTrain fct_h… pct_… 4904 83548 88452 3738 0.0554 0.0423 0.312 101
## 8 tmpTest fct_h… pct_… 1224 28260 29484 1276 0.0415 0.0433 -0.0408 101
## 9 tmpTrain fct_h… pct_… 5357 83095 88452 3738 0.0606 0.0423 0.433 101
## 10 tmpTest fct_h… pct_… 1435 28049 29484 1276 0.0487 0.0433 0.125 101
## # … with 74 more rows, and abbreviated variable names ¹pctRight, ²pctNaive
Variables are plotted based on explanatory power on hour:
tmpLiftHourPct %>%
filter(prd != "pct_hour") %>%
ggplot(aes(x=fct_reorder(prd, lift, min), y=lift)) +
geom_point() +
coord_flip() +
facet_wrap(~c("tmpTest"="1. Test data", "tmpTrain"="2. Training data")[dataType]) +
geom_hline(yintercept=0, lty=2) +
labs(y="Lift (percent correct divided by percent of most frequent hour, minus 1)",
x=NULL,
title="Explanatory power of variable on hour (as factor)"
)
The data are augmented with aggregate elements (time of day, season, and their combination):
# Add random variables to the dataset, then split into test and train.
set.seed(23080412) # Same seed as above, so the pct_* noise columns reproduce
tmpTempRand <- tmpTemp %>%
mutate(pct_0005=sample(0:5, size=nrow(.), replace=TRUE),
pct_0025=sample(0:25, size=nrow(.), replace=TRUE),
pct_0100=sample(0:100, size=nrow(.), replace=TRUE),
pct_0250=sample(0:250, size=nrow(.), replace=TRUE),
pct_0500=sample(0:500, size=nrow(.), replace=TRUE),
pct_1000=sample(0:1000, size=nrow(.), replace=TRUE),
pct_2500=sample(0:2500, size=nrow(.), replace=TRUE),
pct_5000=sample(0:5000, size=nrow(.), replace=TRUE),
# Aggregate elements: time of day, season, and their combination
tod=ifelse(hour>=7 & hour<=18, "Day", "Night"),
season=case_when(month %in% c("Mar", "Apr", "May") ~ "Spring",
month %in% c("Jun", "Jul", "Aug") ~ "Summer",
month %in% c("Sep", "Oct", "Nov") ~ "Fall",
month %in% c("Dec", "Jan", "Feb") ~ "Winter",
TRUE~"typo" # Sentinel fallback: flags any unmatched month value
),
todSeason=paste0(season, "-", tod),
# Convert to factors with explicit level order for stable plots/tables
tod=factor(tod, levels=c("Day", "Night")),
season=factor(season, levels=c("Spring", "Summer", "Fall", "Winter")),
todSeason=factor(todSeason,
levels=paste0(rep(c("Spring", "Summer", "Fall", "Winter"), each=2),
"-",
c("Day", "Night")
)
)
)
tmpTempRand %>% count(tod)
## # A tibble: 2 × 2
## tod n
## <fct> <int>
## 1 Day 58968
## 2 Night 58968
tmpTempRand %>% count(season)
## # A tibble: 4 × 2
## season n
## <fct> <int>
## 1 Spring 30912
## 2 Summer 29064
## 3 Fall 28392
## 4 Winter 29568
tmpTempRand %>% count(todSeason)
## # A tibble: 8 × 2
## todSeason n
## <fct> <int>
## 1 Spring-Day 15456
## 2 Spring-Night 15456
## 3 Summer-Day 14532
## 4 Summer-Night 14532
## 5 Fall-Day 14196
## 6 Fall-Night 14196
## 7 Winter-Day 14784
## 8 Winter-Night 14784
# 75/25 train-test split on row indices.
# seq_len() replaces 1:nrow(): it draws identically for nrow > 0 (the index
# vector is the same), and avoids the c(1, 0) footgun on an empty frame.
idxTrain <- sort(sample(seq_len(nrow(tmpTempRand)), size=round(0.75*nrow(tmpTempRand)), replace=FALSE))
tmpTempTrain <- tmpTempRand[idxTrain, ]
tmpTempTest <- tmpTempRand[-idxTrain, ]
# Example process for season
simpleOneVarTrainTest(dfTrain=tmpTempTrain,
dfTest=tmpTempTest,
tgt="season",
prd="pct_temperature_2m",
printReport=TRUE,
returnData=FALSE
)
##
## Confusion data based on original target variable: season
## # A tibble: 4 × 7
## season right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Spring 10100 13139 23239 0.435 0.263 0.654
## 2 Summer 19058 2720 21778 0.875 0.246 2.55
## 3 Fall 5045 16244 21289 0.237 0.241 -0.0154
## 4 Winter 18404 3742 22146 0.831 0.250 2.32
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 4 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Spring 10100 10948 21048 0.480
## 2 Summer 19058 9484 28542 0.668
## 3 Fall 5045 6765 11810 0.427
## 4 Winter 18404 8648 27052 0.680
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 52607 35845 88452 23239 0.595 0.263 1.26 101
##
## Confusion data based on original target variable: season
## # A tibble: 4 × 7
## season right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Spring 3382 4291 7673 0.441 0.260 0.694
## 2 Summer 6366 920 7286 0.874 0.247 2.54
## 3 Fall 1660 5443 7103 0.234 0.241 -0.0299
## 4 Winter 6240 1182 7422 0.841 0.252 2.34
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 4 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Spring 3382 3625 7007 0.483
## 2 Summer 6366 3065 9431 0.675
## 3 Fall 1660 2292 3952 0.420
## 4 Winter 6240 2854 9094 0.686
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 17648 11836 29484 7673 0.599 0.260 1.30 101
Predictive power for each variable on season is explored:
# Get all pct variables
pctVars <- tmpTempTrain %>%
select(starts_with("pct")) %>%
names()
pctVars
## [1] "pct_hour" "pct_temperature_2m"
## [3] "pct_relativehumidity_2m" "pct_dewpoint_2m"
## [5] "pct_apparent_temperature" "pct_pressure_msl"
## [7] "pct_surface_pressure" "pct_precipitation"
## [9] "pct_rain" "pct_snowfall"
## [11] "pct_cloudcover" "pct_cloudcover_low"
## [13] "pct_cloudcover_mid" "pct_cloudcover_high"
## [15] "pct_shortwave_radiation" "pct_direct_radiation"
## [17] "pct_direct_normal_irradiance" "pct_diffuse_radiation"
## [19] "pct_windspeed_10m" "pct_windspeed_100m"
## [21] "pct_winddirection_10m" "pct_winddirection_100m"
## [23] "pct_windgusts_10m" "pct_et0_fao_evapotranspiration"
## [25] "pct_weathercode" "pct_vapor_pressure_deficit"
## [27] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [29] "pct_soil_temperature_28_to_100cm" "pct_soil_temperature_100_to_255cm"
## [31] "pct_soil_moisture_0_to_7cm" "pct_soil_moisture_7_to_28cm"
## [33] "pct_soil_moisture_28_to_100cm" "pct_soil_moisture_100_to_255cm"
## [35] "pct_0005" "pct_0025"
## [37] "pct_0100" "pct_0250"
## [39] "pct_0500" "pct_1000"
## [41] "pct_2500" "pct_5000"
# Run each variable and combine as dfr
# For each predictor: fit the single-variable mapper on train, score both
# data sets, and keep one overall-confusion row per data set
tmpLiftPctSeason <- map_dfr(.x=pctVars,
.f=function(x) {
tmp <- simpleOneVarTrainTest(dfTrain=tmpTempTrain, dfTest=tmpTempTest, tgt="season", prd=x)
# tmp is a named list (tmpTrain/tmpTest); pull the overall confusion row
map_dfr(.x=tmp, .f=function(y) y$dfConfData$dfConfOverall) %>%
# Recover predictor/target names from the mapper's predictor frame
# (code assumes column 1 = predictor, column 2 = target -- confirm
# against simpleOneVarTrainTest() if its return shape changes)
mutate(dataType=names(tmp),
tgt=names(tmp[[1]]$mapper$dfPredictor)[2],
prd=names(tmp[[1]]$mapper$dfPredictor)[1]
) %>%
select(dataType, tgt, prd, everything())
}
)
tmpLiftPctSeason
## # A tibble: 84 × 11
## dataType tgt prd right wrong n maxN pctRi…¹ pctNa…² lift nBucket
## <chr> <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 tmpTrain season pct_h… 23239 65213 88452 23239 0.263 0.263 0 24
## 2 tmpTest season pct_h… 7673 21811 29484 7673 0.260 0.260 0 24
## 3 tmpTrain season pct_t… 52607 35845 88452 23239 0.595 0.263 1.26 101
## 4 tmpTest season pct_t… 17648 11836 29484 7673 0.599 0.260 1.30 101
## 5 tmpTrain season pct_r… 25384 63068 88452 23239 0.287 0.263 0.0923 60
## 6 tmpTest season pct_r… 8233 21251 29484 7673 0.279 0.260 0.0730 60
## 7 tmpTrain season pct_d… 49883 38569 88452 23239 0.564 0.263 1.15 101
## 8 tmpTest season pct_d… 16632 12852 29484 7673 0.564 0.260 1.17 101
## 9 tmpTrain season pct_a… 52802 35650 88452 23239 0.597 0.263 1.27 101
## 10 tmpTest season pct_a… 17678 11806 29484 7673 0.600 0.260 1.30 101
## # … with 74 more rows, and abbreviated variable names ¹pctRight, ²pctNaive
Variables are plotted based on explanatory power on season:
# Faceted version: lift by predictor for the season target, test vs. train
tmpLiftPctSeason %>%
ggplot(aes(x=fct_reorder(prd, lift, min), y=lift)) +
geom_point() +
coord_flip() +
facet_wrap(~c("tmpTest"="1. Test data", "tmpTrain"="2. Training data")[dataType]) +
geom_hline(yintercept=0, lty=2) +
labs(y="Lift (percent correct divided by percent of most frequent season, minus 1)",
x=NULL,
title="Explanatory power of variable on season"
)
# Overlaid version: same data, test and train distinguished by color instead
tmpLiftPctSeason %>%
ggplot(aes(x=fct_reorder(prd, lift, min), y=lift)) +
geom_point(aes(color=c("tmpTest"="1. Test data", "tmpTrain"="2. Training data")[dataType])) +
coord_flip() +
geom_hline(yintercept=0, lty=2) +
labs(y="Lift (percent correct divided by percent of most frequent season, minus 1)",
x=NULL,
title="Explanatory power of variable on season"
) +
scale_color_discrete(NULL)
Predictive power for each variable on day-night is explored:
# Get all pct variables (exclude hour)
pctVars <- tmpTempTrain %>%
select(starts_with("pct")) %>%
select(-pct_hour) %>%
names()
pctVars
## [1] "pct_temperature_2m" "pct_relativehumidity_2m"
## [3] "pct_dewpoint_2m" "pct_apparent_temperature"
## [5] "pct_pressure_msl" "pct_surface_pressure"
## [7] "pct_precipitation" "pct_rain"
## [9] "pct_snowfall" "pct_cloudcover"
## [11] "pct_cloudcover_low" "pct_cloudcover_mid"
## [13] "pct_cloudcover_high" "pct_shortwave_radiation"
## [15] "pct_direct_radiation" "pct_direct_normal_irradiance"
## [17] "pct_diffuse_radiation" "pct_windspeed_10m"
## [19] "pct_windspeed_100m" "pct_winddirection_10m"
## [21] "pct_winddirection_100m" "pct_windgusts_10m"
## [23] "pct_et0_fao_evapotranspiration" "pct_weathercode"
## [25] "pct_vapor_pressure_deficit" "pct_soil_temperature_0_to_7cm"
## [27] "pct_soil_temperature_7_to_28cm" "pct_soil_temperature_28_to_100cm"
## [29] "pct_soil_temperature_100_to_255cm" "pct_soil_moisture_0_to_7cm"
## [31] "pct_soil_moisture_7_to_28cm" "pct_soil_moisture_28_to_100cm"
## [33] "pct_soil_moisture_100_to_255cm" "pct_0005"
## [35] "pct_0025" "pct_0100"
## [37] "pct_0250" "pct_0500"
## [39] "pct_1000" "pct_2500"
## [41] "pct_5000"
# Run each variable and combine as dfr
# Same per-predictor lift sweep as for season, now with tod (day/night) as target
tmpLiftPctDayNight <- map_dfr(.x=pctVars,
.f=function(x) {
tmp <- simpleOneVarTrainTest(dfTrain=tmpTempTrain, dfTest=tmpTempTest, tgt="tod", prd=x)
# Keep one overall-confusion row per data set (tmpTrain/tmpTest)
map_dfr(.x=tmp, .f=function(y) y$dfConfData$dfConfOverall) %>%
mutate(dataType=names(tmp),
tgt=names(tmp[[1]]$mapper$dfPredictor)[2],
prd=names(tmp[[1]]$mapper$dfPredictor)[1]
) %>%
select(dataType, tgt, prd, everything())
}
)
tmpLiftPctDayNight
## # A tibble: 82 × 11
## dataType tgt prd right wrong n maxN pctRi…¹ pctNa…² lift nBucket
## <chr> <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 tmpTrain tod pct_t… 49032 39420 88452 44253 0.554 0.500 1.08e-1 101
## 2 tmpTest tod pct_t… 16252 13232 29484 14769 0.551 0.501 1.00e-1 101
## 3 tmpTrain tod pct_r… 53982 34470 88452 44253 0.610 0.500 2.20e-1 60
## 4 tmpTest tod pct_r… 18097 11387 29484 14769 0.614 0.501 2.25e-1 60
## 5 tmpTrain tod pct_d… 45840 42612 88452 44253 0.518 0.500 3.59e-2 101
## 6 tmpTest tod pct_d… 14940 14544 29484 14769 0.507 0.501 1.16e-2 101
## 7 tmpTrain tod pct_a… 48214 40238 88452 44253 0.545 0.500 8.95e-2 101
## 8 tmpTest tod pct_a… 15884 13600 29484 14769 0.539 0.501 7.55e-2 101
## 9 tmpTrain tod pct_p… 45701 42751 88452 44253 0.517 0.500 3.27e-2 101
## 10 tmpTest tod pct_p… 14774 14710 29484 14769 0.501 0.501 3.39e-4 101
## # … with 72 more rows, and abbreviated variable names ¹pctRight, ²pctNaive
Variables are plotted based on explanatory power on night-day:
tmpLiftPctDayNight %>%
ggplot(aes(x=fct_reorder(prd, lift, min), y=lift)) +
geom_point() +
coord_flip() +
facet_wrap(~c("tmpTest"="1. Test data", "tmpTrain"="2. Training data")[dataType]) +
geom_hline(yintercept=0, lty=2) +
labs(y="Lift (percent correct divided by percent of most frequent day-night, minus 1)",
x=NULL,
title="Explanatory power of variable on day-night"
)
tmpLiftPctDayNight %>%
ggplot(aes(x=fct_reorder(prd, lift, min), y=lift)) +
geom_point(aes(color=c("tmpTest"="1. Test data", "tmpTrain"="2. Training data")[dataType])) +
coord_flip() +
geom_hline(yintercept=0, lty=2) +
labs(y="Lift (percent correct divided by percent of most frequent day-night, minus 1)",
x=NULL,
title="Explanatory power of variable on day-night"
) +
scale_color_discrete(NULL)
Lift by variable is plotted for season and day-night:
# Scatter of test-set lift on the two targets (season vs. day-night), one
# point per predictor, with panels and colors by predictor family
dfLiftWide <- tmpLiftPctSeason %>%
  filter(prd != "pct_hour") %>%
  bind_rows(tmpLiftPctDayNight) %>%
  filter(dataType=="tmpTest") %>%
  select(tgt, prd, lift) %>%
  pivot_wider(id_cols="prd", names_from="tgt", values_from="lift") %>%
  mutate(prdType=case_when(str_detect(prd, "_\\d{4}$") ~ "4. Random",
                           str_detect(prd, "radia|evapotrans") ~ "2. Radiation/et0",
                           str_detect(prd, "temper|dewp|vapor|soil") ~ "1. Temp/Dew/Vapor/Soil",
                           TRUE ~ "3. Other"
                           )
  )
ggplot(dfLiftWide, aes(x=season, y=tod)) +
  geom_point(aes(color=prdType)) +
  geom_hline(yintercept=c(0, 0.25), lty=2) +
  geom_vline(xintercept=c(0, 0.5), lty=2) +
  facet_wrap(~prdType) +
  scale_color_discrete("Type") +
  labs(x="Lift (season)",
       y="Lift (night-day)",
       title="Lift on test data of single-variable predictor"
  )
Predictive power for each variable on day-night-season is explored:
# Get all pct variables (exclude hour)
pctVars <- tmpTempTrain %>%
select(starts_with("pct")) %>%
select(-pct_hour) %>%
names()
pctVars
## [1] "pct_temperature_2m" "pct_relativehumidity_2m"
## [3] "pct_dewpoint_2m" "pct_apparent_temperature"
## [5] "pct_pressure_msl" "pct_surface_pressure"
## [7] "pct_precipitation" "pct_rain"
## [9] "pct_snowfall" "pct_cloudcover"
## [11] "pct_cloudcover_low" "pct_cloudcover_mid"
## [13] "pct_cloudcover_high" "pct_shortwave_radiation"
## [15] "pct_direct_radiation" "pct_direct_normal_irradiance"
## [17] "pct_diffuse_radiation" "pct_windspeed_10m"
## [19] "pct_windspeed_100m" "pct_winddirection_10m"
## [21] "pct_winddirection_100m" "pct_windgusts_10m"
## [23] "pct_et0_fao_evapotranspiration" "pct_weathercode"
## [25] "pct_vapor_pressure_deficit" "pct_soil_temperature_0_to_7cm"
## [27] "pct_soil_temperature_7_to_28cm" "pct_soil_temperature_28_to_100cm"
## [29] "pct_soil_temperature_100_to_255cm" "pct_soil_moisture_0_to_7cm"
## [31] "pct_soil_moisture_7_to_28cm" "pct_soil_moisture_28_to_100cm"
## [33] "pct_soil_moisture_100_to_255cm" "pct_0005"
## [35] "pct_0025" "pct_0100"
## [37] "pct_0250" "pct_0500"
## [39] "pct_1000" "pct_2500"
## [41] "pct_5000"
# Run each variable and combine as dfr
# Same per-predictor lift sweep, now with the 8-level todSeason target
tmpLiftPctDayNightSeason <- map_dfr(.x=pctVars,
.f=function(x) {
tmp <- simpleOneVarTrainTest(dfTrain=tmpTempTrain, dfTest=tmpTempTest, tgt="todSeason", prd=x)
# Keep one overall-confusion row per data set (tmpTrain/tmpTest)
map_dfr(.x=tmp, .f=function(y) y$dfConfData$dfConfOverall) %>%
mutate(dataType=names(tmp),
tgt=names(tmp[[1]]$mapper$dfPredictor)[2],
prd=names(tmp[[1]]$mapper$dfPredictor)[1]
) %>%
select(dataType, tgt, prd, everything())
}
)
tmpLiftPctDayNightSeason
## # A tibble: 82 × 11
## dataType tgt prd right wrong n maxN pctRi…¹ pctNa…² lift nBucket
## <chr> <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 tmpTrain todSeas… pct_… 30074 58378 88452 11702 0.340 0.132 1.57 101
## 2 tmpTest todSeas… pct_… 9984 19500 29484 3919 0.339 0.133 1.55 101
## 3 tmpTrain todSeas… pct_… 15732 72720 88452 11702 0.178 0.132 0.344 60
## 4 tmpTest todSeas… pct_… 5070 24414 29484 3919 0.172 0.133 0.294 60
## 5 tmpTrain todSeas… pct_… 26097 62355 88452 11702 0.295 0.132 1.23 101
## 6 tmpTest todSeas… pct_… 8457 21027 29484 3919 0.287 0.133 1.16 101
## 7 tmpTrain todSeas… pct_… 29669 58783 88452 11702 0.335 0.132 1.54 101
## 8 tmpTest todSeas… pct_… 9829 19655 29484 3919 0.333 0.133 1.51 101
## 9 tmpTrain todSeas… pct_… 17524 70928 88452 11702 0.198 0.132 0.498 101
## 10 tmpTest todSeas… pct_… 5614 23870 29484 3919 0.190 0.133 0.433 101
## # … with 72 more rows, and abbreviated variable names ¹pctRight, ²pctNaive
Variables are plotted based on explanatory power on night-day-season:
tmpLiftPctDayNightSeason %>%
ggplot(aes(x=fct_reorder(prd, lift, min), y=lift)) +
geom_point() +
coord_flip() +
facet_wrap(~c("tmpTest"="1. Test data", "tmpTrain"="2. Training data")[dataType]) +
geom_hline(yintercept=0, lty=2) +
labs(y="Lift (percent correct divided by percent of most frequent day-night-season, minus 1)",
x=NULL,
title="Explanatory power of variable on day-night-season"
)
tmpLiftPctDayNightSeason %>%
ggplot(aes(x=fct_reorder(prd, lift, min), y=lift)) +
geom_point(aes(color=c("tmpTest"="1. Test data", "tmpTrain"="2. Training data")[dataType])) +
coord_flip() +
geom_hline(yintercept=0, lty=2) +
labs(y="Lift (percent correct divided by percent of most frequent day-night-season, minus 1)",
x=NULL,
title="Explanatory power of variable on day-night-season"
) +
scale_color_discrete(NULL)
# Example process for season
simpleOneVarTrainTest(dfTrain=tmpTempTrain,
dfTest=tmpTempTest,
tgt="season",
prd="pct_temperature_2m",
printReport=TRUE,
returnData=FALSE
)
##
## Confusion data based on original target variable: season
## # A tibble: 4 × 7
## season right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Spring 10100 13139 23239 0.435 0.263 0.654
## 2 Summer 19058 2720 21778 0.875 0.246 2.55
## 3 Fall 5045 16244 21289 0.237 0.241 -0.0154
## 4 Winter 18404 3742 22146 0.831 0.250 2.32
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 4 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Spring 10100 10948 21048 0.480
## 2 Summer 19058 9484 28542 0.668
## 3 Fall 5045 6765 11810 0.427
## 4 Winter 18404 8648 27052 0.680
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 52607 35845 88452 23239 0.595 0.263 1.26 101
##
## Confusion data based on original target variable: season
## # A tibble: 4 × 7
## season right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Spring 3382 4291 7673 0.441 0.260 0.694
## 2 Summer 6366 920 7286 0.874 0.247 2.54
## 3 Fall 1660 5443 7103 0.234 0.241 -0.0299
## 4 Winter 6240 1182 7422 0.841 0.252 2.34
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 4 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Spring 3382 3625 7007 0.483
## 2 Summer 6366 3065 9431 0.675
## 3 Fall 1660 2292 3952 0.420
## 4 Winter 6240 2854 9094 0.686
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 17648 11836 29484 7673 0.599 0.260 1.30 101
The top-performing variable for day-night-season is plotted:
# Example process for night-day-season
simpleOneVarTrainTest(dfTrain=tmpTempTrain,
dfTest=tmpTempTest,
tgt="todSeason",
prd="pct_soil_temperature_0_to_7cm",
printReport=TRUE,
returnData=FALSE
)
##
## Confusion data based on original target variable: todSeason
## # A tibble: 8 × 7
## todSeason right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Spring-Day 2389 9313 11702 0.204 0.132 0.543
## 2 Spring-Night 5597 5940 11537 0.485 0.130 2.72
## 3 Summer-Day 6841 4026 10867 0.630 0.123 4.12
## 4 Summer-Night 7412 3499 10911 0.679 0.123 4.51
## 5 Fall-Day 0 10614 10614 0 0.120 -1
## 6 Fall-Night 864 9811 10675 0.0809 0.121 -0.329
## 7 Winter-Day 3252 7818 11070 0.294 0.125 1.35
## 8 Winter-Night 6800 4276 11076 0.614 0.125 3.90
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 7 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Spring-Day 2389 6462 8851 0.270
## 2 Spring-Night 5597 14669 20266 0.276
## 3 Summer-Day 6841 4142 10983 0.623
## 4 Summer-Night 7412 12127 19539 0.379
## 5 Fall-Night 864 2628 3492 0.247
## 6 Winter-Day 3252 4003 7255 0.448
## 7 Winter-Night 6800 11266 18066 0.376
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 33155 55297 88452 11702 0.375 0.132 1.83 101
##
## Confusion data based on original target variable: todSeason
## # A tibble: 8 × 7
## todSeason right wrong n pctRight pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Spring-Day 727 3027 3754 0.194 0.127 0.521
## 2 Spring-Night 1839 2080 3919 0.469 0.133 2.53
## 3 Summer-Day 2333 1332 3665 0.637 0.124 4.12
## 4 Summer-Night 2510 1111 3621 0.693 0.123 4.64
## 5 Fall-Day 0 3582 3582 0 0.121 -1
## 6 Fall-Night 289 3232 3521 0.0821 0.119 -0.313
## 7 Winter-Day 1083 2631 3714 0.292 0.126 1.31
## 8 Winter-Night 2251 1457 3708 0.607 0.126 3.83
##
## Confusion data based on predicted target variable: predicted
## # A tibble: 7 × 5
## predicted right wrong n pctRight
## <fct> <int> <int> <int> <dbl>
## 1 Spring-Day 727 2186 2913 0.250
## 2 Spring-Night 1839 4997 6836 0.269
## 3 Summer-Day 2333 1304 3637 0.641
## 4 Summer-Night 2510 3939 6449 0.389
## 5 Fall-Night 289 874 1163 0.248
## 6 Winter-Day 1083 1328 2411 0.449
## 7 Winter-Night 2251 3824 6075 0.371
##
## Overall confusion matrix
## # A tibble: 1 × 8
## right wrong n maxN pctRight pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 11032 18452 29484 3919 0.374 0.133 1.82 101
Data are explored for k-means:
# Create data
# Keep time plus the percentile-scaled weather variables; drop pct_hour,
# pct_weathercode, and the random-noise baselines (whose names all end in
# 0 or 5 -- the printed names() below confirms no real variable is caught)
kmTrain <- tmpTempTrain %>%
select(time, starts_with("pct")) %>%
select(-pct_hour, -pct_weathercode, -ends_with("0"), -ends_with("5"))
names(kmTrain)
## [1] "time" "pct_temperature_2m"
## [3] "pct_relativehumidity_2m" "pct_dewpoint_2m"
## [5] "pct_apparent_temperature" "pct_pressure_msl"
## [7] "pct_surface_pressure" "pct_precipitation"
## [9] "pct_rain" "pct_snowfall"
## [11] "pct_cloudcover" "pct_cloudcover_low"
## [13] "pct_cloudcover_mid" "pct_cloudcover_high"
## [15] "pct_shortwave_radiation" "pct_direct_radiation"
## [17] "pct_direct_normal_irradiance" "pct_diffuse_radiation"
## [19] "pct_windspeed_10m" "pct_windspeed_100m"
## [21] "pct_winddirection_10m" "pct_winddirection_100m"
## [23] "pct_windgusts_10m" "pct_et0_fao_evapotranspiration"
## [25] "pct_vapor_pressure_deficit" "pct_soil_temperature_0_to_7cm"
## [27] "pct_soil_temperature_7_to_28cm" "pct_soil_temperature_28_to_100cm"
## [29] "pct_soil_temperature_100_to_255cm" "pct_soil_moisture_0_to_7cm"
## [31] "pct_soil_moisture_7_to_28cm" "pct_soil_moisture_28_to_100cm"
## [33] "pct_soil_moisture_100_to_255cm"
# Confirm that mean and sd are reasonably similar
# (k-means is scale-sensitive; roughly comparable spreads justify clustering
# the percentile variables without further standardization)
kmSD <- kmTrain %>%
summarize(across(starts_with("pct"), .fns=list(mean=mean, sd=sd))) %>%
pivot_longer(cols=everything()) %>%
# Split names like "pct_rain_mean" into metric ("rain") and type ("mean")
mutate(metric=str_remove_all(name, pattern="pct_|_mean|_sd"),
type=str_extract(name, pattern="[a-zA-Z0-9]+$")
) %>%
pivot_wider(id_cols="metric", names_from="type", values_from="value")
kmSD %>%
ggplot(aes(x=mean, y=sd)) +
geom_point(alpha=0.5) +
labs(title="Mean and standard deviation for potential k-means variables", x="Mean", y="SD")
# The low-mean outliers are the precipitation-family metrics (output below)
kmSD %>%
filter(mean<=20)
## # A tibble: 3 × 3
## metric mean sd
## <chr> <dbl> <dbl>
## 1 precipitation 13.4 32.5
## 2 rain 11.5 30.7
## 3 snowfall 3.53 18.2
# Initial k-means with two centers
set.seed(23081914)
kmTrain_002 <- kmTrain %>%
select(-time) %>%
kmeans(centers=2)
# Plot each variable's cluster means, ordered by between-cluster separation
# (.fun takes the signed difference, which assumes exactly 2 clusters)
kmTrain_002$centers %>%
tibble::as_tibble() %>%
mutate(cluster=row_number()) %>%
pivot_longer(cols=-c(cluster)) %>%
ggplot(aes(x=fct_reorder(str_remove(name, "pct_"), value, .fun=function(a) a[2]-a[1]), y=value)) +
geom_point(aes(color=factor(cluster))) +
scale_color_discrete("Cluster") +
facet_wrap(~factor(cluster)) +
labs(title="Cluster means (kmeans, centers=2)", x="Metric", y="Cluster mean") +
lims(y=c(0, 100)) +
geom_hline(yintercept=40, lty=2) +
coord_flip()
Clusters are assessed:
# Attach cluster labels and re-derive calendar features for assessment.
# Assumes rows of kmTrain align 1:1 with kmTrain_002$cluster (kmeans ran on
# kmTrain minus time, so row order is preserved).
# NOTE(review): the tod/season/todSeason logic duplicates the tmpTempRand
# derivation above -- a shared helper would prevent the two drifting apart.
kmAssess <- kmTrain %>%
mutate(cl=factor(kmTrain_002$cluster),
fct_month=factor(month.abb[month(time)], levels=month.abb),
hour=as.integer(hour(time)),
tod=ifelse(hour>=7 & hour<=18, "Day", "Night"),
season=case_when(fct_month %in% c("Mar", "Apr", "May") ~ "Spring",
fct_month %in% c("Jun", "Jul", "Aug") ~ "Summer",
fct_month %in% c("Sep", "Oct", "Nov") ~ "Fall",
fct_month %in% c("Dec", "Jan", "Feb") ~ "Winter",
TRUE~"typo"
),
todSeason=paste0(season, "-", tod),
tod=factor(tod, levels=c("Day", "Night")),
season=factor(season, levels=c("Spring", "Summer", "Fall", "Winter")),
todSeason=factor(todSeason,
levels=paste0(rep(c("Spring", "Summer", "Fall", "Winter"), each=2),
"-",
c("Day", "Night")
)
)
)
# Assessed by month and hour
# pct is each cluster's share within a month-hour cell
kmAssess %>%
count(fct_month, hour, cl) %>%
group_by(fct_month, hour) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(y=fct_month, x=hour)) +
geom_tile(aes(fill=pct)) +
facet_wrap(~cl, nrow=1) +
scale_fill_continuous(low="white", high="green") +
labs(title="Percentage by cluster (kmeans with 2 centers)", x="Hour", y=NULL)
# Assessed by todSeason
# pct is each cluster's share within a todSeason level; rows are ordered by
# cluster 1's share. Fix: the x axis holds the cluster factor, so the label
# is "Cluster" (it previously read "Hour", copied from the month/hour plot).
kmAssess %>%
count(todSeason, cl) %>%
group_by(todSeason) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(y=fct_reorder(todSeason, pct, .fun=function(x) x[1]), x=factor(cl))) +
geom_tile(aes(fill=pct)) +
scale_fill_continuous(low="white", high="green") +
labs(title="Percentage by cluster (kmeans with 2 centers)", x="Cluster", y=NULL)
A function is written for creating k-means:
plotClusterMeans <- function(km, nrow=NULL, ncol=NULL, scales="fixed") {
  # FUNCTION ARGUMENTS
  # km: object returned by stats::kmeans(...)
  # nrow: number of rows for faceting (NULL means default)
  # ncol: number of columns for faceting (NULL means default)
  # scales: passed to facet_wrap as scales=scales
  # NOTE: the argument `nrow` shadows base::nrow, but nrow(km$centers) below
  # still resolves to the base function because R skips non-function bindings
  # when looking up a name used as a call
  # Separation used to order metrics: the signed difference for exactly two
  # clusters (preserves direction), otherwise the range across clusters.
  # (Plain if/else replaces the earlier scalar ifelse() -- ifelse() is the
  # vectorized form and is an anti-pattern for a length-1 condition.)
  clusterSpread <- function(a) {
    if (length(a) == 2) a[2] - a[1] else diff(range(a))
  }
  # Assess clustering by dimension: one point per (metric, cluster) mean
  p1 <- km$centers %>%
    tibble::as_tibble() %>%
    mutate(cluster=row_number()) %>%
    pivot_longer(cols=-c(cluster)) %>%
    ggplot(aes(x=fct_reorder(name, value, .fun=clusterSpread), y=value)) +
    geom_point(aes(color=factor(cluster))) +
    scale_color_discrete("Cluster") +
    facet_wrap(~factor(cluster), nrow=nrow, ncol=ncol, scales=scales) +
    labs(title=paste0("Cluster means (kmeans, centers=", nrow(km$centers), ")"),
         x="Metric",
         y="Cluster mean"
    ) +
    # Reference line at the median of all cluster-center values
    geom_hline(yintercept=median(km$centers), lty=2) +
    coord_flip()
  print(p1)
}
plotClusterPct <- function(df, km, keyVars, nRowFacet=1, printPlot=TRUE) {
  # FUNCTION ARGUMENTS:
  # df: data frame initially passed to stats::kmeans(...)
  # km: object returned by stats::kmeans(...) -- km$cluster must align
  #     row-wise with df
  # keyVars: character vector of length 1 (y-only, x will be cl) or length 2 (x, y, cl will facet)
  # nRowFacet: number of rows for facetting (only relevant if length(keyVars) is 2)
  # printPlot: boolean, should plot be printed? (if not true, plot will be returned)
  # Check length of keyVars
  if(!(length(keyVars) %in% c(1, 2))) stop("\nArgument keyVars must be length-1 or length-2\n")
  # Scalar branch: plain if/else rather than ifelse() for length-1 conditions
  if(length(keyVars) == 1) {
    xLab <- "Cluster"
    yLab <- keyVars[1]
  } else {
    xLab <- keyVars[1]
    yLab <- keyVars[2]
  }
  # pct is each cluster's share within every keyVars combination
  p1 <- df %>%
    mutate(cl=factor(km$cluster)) %>%
    group_by(across(c(all_of(keyVars), "cl"))) %>%
    summarize(n=n(), .groups="drop") %>%
    group_by(across(all_of(keyVars))) %>%
    mutate(pct=n/sum(n)) %>%
    ungroup() %>%
    ggplot() +
    scale_fill_continuous(low="white", high="green") +
    labs(title=paste0("Percentage by cluster (kmeans with ", nrow(km$centers), " centers)"),
         x=xLab,
         y=yLab
    )
  # .data[[ ]] is the tidy-eval idiom for a column name held in a string;
  # it replaces get(), which falls back to searching the calling environment
  # when the column is absent (a silent-wrong-answer hazard)
  if(length(keyVars)==1) p1 <- p1 + geom_tile(aes(fill=pct, x=cl, y=.data[[keyVars[1]]]))
  if(length(keyVars)==2) {
    p1 <- p1 +
      geom_tile(aes(fill=pct, x=.data[[keyVars[1]]], y=.data[[keyVars[2]]])) +
      facet_wrap(~cl, nrow=nRowFacet)
  }
  if(isTRUE(printPlot)) print(p1)
  else return(p1)
}
runKMeans <- function(df,
                      vars=NULL,
                      centers=2,
                      nStart=1L,
                      iter.max=10L,
                      seed=NULL,
                      plotMeans=FALSE,
                      nrowMeans=NULL,
                      plotPct=NULL,
                      nrowPct=1
                      ) {
  # FUNCTION ARGUMENTS:
  # df: data frame for clustering
  # vars: variables to be used for clustering (NULL means everything in df)
  # centers: number of centers
  # nStart: passed to kmeans as nstart
  # iter.max: passed to kmeans
  # seed: seed to be set (if NULL, no seed is set)
  # plotMeans: boolean, plot variable means by cluster?
  # nrowMeans: argument passed as nrow for faceting rows in plotClusterMeans() - NULL is default ggplot2
  # plotPct: list of character vectors to be passed sequentially as keyVars to plotClusterPct()
  # NULL means do not run
  # pctByCluster=list(c("var1"), c("var2", "var3")) will run plotting twice
  # nrowPct: argument for faceting number of rows in plotClusterPct()
  # RETURNS: the stats::kmeans object
  # Set seed if requested (makes each call independently reproducible)
  if(!is.null(seed)) set.seed(seed)
  # Get the variable names if passed as NULL
  if(is.null(vars)) vars <- names(df)
  # Run the k-means process on the selected columns only
  km <- df %>%
    select(all_of(vars)) %>%
    kmeans(centers=centers, iter.max=iter.max, nstart=nStart)
  # Assess clustering by dimension if requested
  if(isTRUE(plotMeans)) plotClusterMeans(km, nrow=nrowMeans)
  # seq_along() replaces 1:length(): safe if plotPct is an empty list
  # (1:length(x) would iterate over c(1, 0)); also drops the stray double
  # parentheses in the former !is.null((plotPct)) check
  if(!is.null(plotPct)) {
    for(ctr in seq_along(plotPct)) {
      plotClusterPct(df=df, km=km, keyVars=plotPct[[ctr]], nRowFacet=nrowPct)
    }
  }
  # Return the k-means object
  km
}
# Get relevant variables
varsTrain <- tmpTempTrain %>%
select(starts_with("pct")) %>%
select(-pct_hour, -pct_weathercode, -ends_with("0"), -ends_with("5")) %>%
names()
varsTrain
## [1] "pct_temperature_2m" "pct_relativehumidity_2m"
## [3] "pct_dewpoint_2m" "pct_apparent_temperature"
## [5] "pct_pressure_msl" "pct_surface_pressure"
## [7] "pct_precipitation" "pct_rain"
## [9] "pct_snowfall" "pct_cloudcover"
## [11] "pct_cloudcover_low" "pct_cloudcover_mid"
## [13] "pct_cloudcover_high" "pct_shortwave_radiation"
## [15] "pct_direct_radiation" "pct_direct_normal_irradiance"
## [17] "pct_diffuse_radiation" "pct_windspeed_10m"
## [19] "pct_windspeed_100m" "pct_winddirection_10m"
## [21] "pct_winddirection_100m" "pct_windgusts_10m"
## [23] "pct_et0_fao_evapotranspiration" "pct_vapor_pressure_deficit"
## [25] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [27] "pct_soil_temperature_28_to_100cm" "pct_soil_temperature_100_to_255cm"
## [29] "pct_soil_moisture_0_to_7cm" "pct_soil_moisture_7_to_28cm"
## [31] "pct_soil_moisture_28_to_100cm" "pct_soil_moisture_100_to_255cm"
km003 <- runKMeans(tmpTempTrain,
vars=varsTrain,
centers=3,
nStart=25,
seed=23082113,
iter.max=20L,
plotMeans=TRUE,
plotPct=list(c("todSeason"), c("hour", "month")),
nrowPct=1
)
str(km003)
## List of 9
## $ cluster : int [1:88452] 2 2 2 2 2 2 2 2 2 2 ...
## $ centers : num [1:3, 1:32] 65.9 22.5 70 60.8 55.1 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:32] "pct_temperature_2m" "pct_relativehumidity_2m" "pct_dewpoint_2m" "pct_apparent_temperature" ...
## $ totss : num 2.72e+09
## $ withinss : num [1:3] 4.00e+08 8.61e+08 6.23e+08
## $ tot.withinss: num 1.88e+09
## $ betweenss : num 8.33e+08
## $ size : int [1:3] 21790 35494 31168
## $ iter : int 4
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
plotClusterMeans(km003)
plotClusterPct(df=tmpTempTrain, km=km003, keyVars=c("todSeason"))
plotClusterPct(df=tmpTempTrain, km=km003, keyVars=c("hour", "month"))
The first split with 2 clusters is explored:
km002 <- runKMeans(tmpTempTrain,
vars=varsTrain,
centers=2,
nStart=25,
seed=23082113,
iter.max=20L,
plotMeans=TRUE,
plotPct=list(c("todSeason"), c("hour", "month")),
nrowPct=1
)
str(km002)
## List of 9
## $ cluster : int [1:88452] 2 2 2 2 2 2 2 2 1 1 ...
## $ centers : num [1:2, 1:32] 60.1 41.3 36.7 59.8 56 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:32] "pct_temperature_2m" "pct_relativehumidity_2m" "pct_dewpoint_2m" "pct_apparent_temperature" ...
## $ totss : num 2.72e+09
## $ withinss : num [1:2] 9.54e+08 1.20e+09
## $ tot.withinss: num 2.15e+09
## $ betweenss : num 5.68e+08
## $ size : int [1:2] 40717 47735
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
The first split appears to be day-night, based primarily on radiation and evapotranspiration.
Sum-squares are explored:
km001 <- runKMeans(tmpTempTrain,
vars=varsTrain,
centers=1,
nStart=25,
seed=23082113,
iter.max=20L,
plotMeans=TRUE,
plotPct=list(c("todSeason"), c("hour", "month")),
nrowPct=1
)
str(km001)
## List of 9
## $ cluster : int [1:88452] 1 1 1 1 1 1 1 1 1 1 ...
## $ centers : num [1, 1:32] 49.9 49.2 49.9 49.9 49.8 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr "1"
## .. ..$ : chr [1:32] "pct_temperature_2m" "pct_relativehumidity_2m" "pct_dewpoint_2m" "pct_apparent_temperature" ...
## $ totss : num 2.72e+09
## $ withinss : num 2.72e+09
## $ tot.withinss: num 2.72e+09
## $ betweenss : num -0.00118
## $ size : int 88452
## $ iter : int 1
## $ ifault : NULL
## - attr(*, "class")= chr "kmeans"
# Compare fit statistics across the k = 1..3 models; sapply simplifies the
# named numeric vectors into a matrix (one column per model)
sapply(list(km001, km002, km003), FUN=function(x) c("k"=length(x$size),
"totss"=x$totss,
"betweenss"=x$betweenss,
"tot.withinss"=x$tot.withinss,
"iter"=x$iter,
# kmeans with 1 center returns ifault=NULL; coerce to 0 to keep the vector numeric
"ifault"=ifelse(is.null(x$ifault), 0, x$ifault)
)
) %>%
t() %>%
tibble::as_tibble() %>%
# pmin(1, ...) guards against ratios marginally above 1 from floating-point
# error (betweenss is a tiny negative for k=1, per str(km001) above)
mutate(pct=pmin(1, tot.withinss/totss)) %>%
ggplot(aes(x=k, y=pct)) +
geom_line() +
geom_point() +
geom_text(aes(y=pct-0.05, label=round(pct, 3)), size=2.5) +
labs(x="# Clusters",
y="SS-within / SS-total",
title="Sum-squares within as proportion of sum-squares total"
) +
lims(y=c(0, 1))
The first cluster (k=2) primarily splits day from night and accounts for ~20% of total sum-squares. The next cluster (k=3) splits colder-season from warmer-season and accounts for an additional ~10% of total sum-squares.
Clusters are run for 1-15 centers, cached to reduce processing time:
# Fit k-means for 1 through 15 centers with common settings; the same seed
# is re-set inside runKMeans() before each fit, so every k is independently
# reproducible. Plotting is suppressed here (assessed separately below).
# NOTE(review): the Quick-TRANSfer warnings that follow indicate the default
# Hartigan-Wong algorithm struggled to converge at some k -- consider
# algorithm="MacQueen" or a larger iter.max if these fits are relied upon.
kmList <- lapply(1:15, FUN=function(x) runKMeans(tmpTempTrain,
vars=varsTrain,
centers=x,
nStart=25,
seed=23082113,
iter.max=50L,
plotMeans=FALSE,
plotPct=NULL
)
)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
Change in SS-between is explored based on number of clusters:
# Collect fit diagnostics from each cached k-means object into one tibble:
# sapply() builds a metric-by-k matrix, t() flips it to one row per k, then
# pct = within-SS as a share of total SS, dpct = change in pct vs. previous k
dfSS <- sapply(kmList, FUN=function(x) c("nCluster"=length(x$size),
"totss"=x$totss,
"betweenss"=x$betweenss,
"tot.withinss"=x$tot.withinss,
"iter"=x$iter,
# ifault can be NULL on a clean fit; coerce that case to 0
"ifault"=unclass(ifelse(is.null(x$ifault), 0, x$ifault))
)
) %>%
t() %>%
tibble::as_tibble() %>%
mutate(pct=tot.withinss/totss, dpct=pct-lag(pct))
dfSS
## # A tibble: 15 × 8
## nCluster totss betweenss tot.withinss iter ifault pct dpct
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2717574424. -1.18e-3 2717574424. 1 0 1.00 NA
## 2 2 2717574424. 5.68e+8 2149203287. 1 0 0.791 -0.209
## 3 3 2717574424. 8.33e+8 1884232299. 4 0 0.693 -0.0975
## 4 4 2717574424. 1.02e+9 1693334118. 5 0 0.623 -0.0702
## 5 5 2717574424. 1.19e+9 1526994027. 4 0 0.562 -0.0612
## 6 6 2717574424. 1.25e+9 1470443583. 8 0 0.541 -0.0208
## 7 7 2717574424. 1.29e+9 1423572027. 7 0 0.524 -0.0172
## 8 8 2717574424. 1.34e+9 1378112124. 7 0 0.507 -0.0167
## 9 9 2717574424. 1.38e+9 1337992901. 8 0 0.492 -0.0148
## 10 10 2717574424. 1.41e+9 1303760015. 5 0 0.480 -0.0126
## 11 11 2717574424. 1.44e+9 1273189435. 8 0 0.469 -0.0112
## 12 12 2717574424. 1.47e+9 1242878925. 8 0 0.457 -0.0112
## 13 13 2717574424. 1.50e+9 1216883933. 15 0 0.448 -0.00957
## 14 14 2717574424. 1.52e+9 1194305783. 9 0 0.439 -0.00831
## 15 15 2717574424. 1.54e+9 1175263904. 8 0 0.432 -0.00701
# Within-SS share of total SS by number of clusters
# (title typo corrected: "k=means" -> "k-means")
dfSS %>%
ggplot(aes(x=nCluster, y=pmin(1, pct))) +
geom_point() +
geom_line() +
lims(y=c(0, 1)) +
labs(x="# Clusters",
y="Within SS / Total SS",
title="Sum-squares ratio by number of clusters (k-means)"
) +
geom_text(aes(y=pct-0.05, label=round(pct, 3)), size=2.5)
# Incremental change in within-SS share when each additional cluster is added
# (title typo corrected: "k=means" -> "k-means")
dfSS %>%
filter(!is.na(dpct)) %>%
ggplot(aes(x=factor(nCluster), y=dpct)) +
geom_col(fill="lightblue") +
labs(x="When adding this cluster",
y="Change in (Within SS / Total SS)",
title="Sum-squares ratio by number of clusters (k-means)"
) +
geom_text(aes(y=dpct/2, label=round(dpct, 3)), size=2.5)
This is suggestive that exploring evolution of data splits at k = 1, 2, 3, 4, 5 may be informative
The runKMeans() function is updated to allow for passing a k-means object:
# Updated to allow passing a k-means object
runKMeans <- function(df,
km=NULL,
vars=NULL,
centers=2,
nStart=1L,
iter.max=10L,
seed=NULL,
plotMeans=FALSE,
nrowMeans=NULL,
plotPct=NULL,
nrowPct=1,
returnKM=is.null(km)
) {
# FUNCTION ARGUMENTS:
# df: data frame for clustering
# km: k-means object (will shut off k-means processing and run as plot-only)
# vars: variables to be used for clustering (NULL means everything in df)
# centers: number of centers (ignored when km is passed)
# nStart: passed to kmeans() as nstart (ignored when km is passed)
# iter.max: passed to kmeans() (ignored when km is passed)
# seed: seed to be set (if NULL, no seed is set)
# plotMeans: boolean, plot variable means by cluster?
# nrowMeans: argument passed as nrow for faceting rows in plotClusterMeans() - NULL is default ggplot2
# plotPct: list of character vectors to be passed sequentially as keyVars to plotClusterPct()
# NULL means do not run
# plotPct=list(c("var1"), c("var2", "var3")) will run plotting twice
# nrowPct: argument for faceting number of rows in plotClusterPct()
# returnKM: boolean, should the k-means object be returned?
# (defaults to TRUE only when the k-means is fit here)
# Set seed if requested
if(!is.null(seed)) set.seed(seed)
# Get the variable names if passed as NULL
if(is.null(vars)) vars <- names(df)
# Run the k-means process if the object has not been passed
if(is.null(km)) {
km <- df %>%
select(all_of(vars)) %>%
kmeans(centers=centers, iter.max=iter.max, nstart=nStart)
}
# Assess clustering by dimension if requested
if(isTRUE(plotMeans)) plotClusterMeans(km, nrow=nrowMeans)
# seq_along() (not 1:length()) is safe when plotPct is an empty list
if(!is.null(plotPct))
for(ctr in seq_along(plotPct))
plotClusterPct(df=df, km=km, keyVars=plotPct[[ctr]], nRowFacet=nrowPct)
# Return the k-means object if requested (otherwise invisible NULL)
if(isTRUE(returnKM)) return(km)
}
# Function run on the 3-cluster k-means object
runKMeans(df=tmpTempTrain,
km=km003,
plotMeans=TRUE,
plotPct=list(c("todSeason"), c("hour", "month")),
nrowPct=1
)
A function is written to assign points to the nearest cluster centroid:
assignKMeans <- function(km, df, returnAllDistanceData=FALSE) {
# FUNCTION ARGUMENTS:
# km: a k-means object
# df: data frame or tibble containing (at least) the columns km was fit on
# returnAllDistanceData: boolean, should the distance data and clusters be returned?
# TRUE returns a tibble with Euclidean distances to each centroid as V1, V2, ...,
# and the assigned (nearest) cluster as cl
# FALSE returns a vector of cluster assignments as integers
# Select columns from df to match km (select() errors if any are missing)
df <- df %>% select(all_of(colnames(km$centers)))
# all.equal() returns TRUE or a *character vector* describing differences,
# so it must be wrapped in isTRUE(); the previous `!all.equal(...)` errored
# on mismatch instead of reaching stop()
if(!isTRUE(all.equal(names(df), colnames(km$centers)))) stop("\nName mismatch in clustering and frame\n")
# Euclidean distance from every row of df to each centroid:
# sweep() subtracts centroid x from all rows; then row-wise sqrt of sum of squares
distClust <- sapply(seq_len(nrow(km$centers)),
FUN=function(x) sqrt(rowSums(sweep(as.matrix(df),
2,
t(as.matrix(km$centers[x,,drop=FALSE]))
)**2
)
)
) %>%
as.data.frame() %>%
tibble::as_tibble() %>%
# Nearest centroid wins; ties resolve to the lowest cluster index (which.min)
mutate(cl=apply(., 1, which.min))
# Return the proper file
if(isTRUE(returnAllDistanceData)) return(distClust)
else return(distClust$cl)
}
# Example of returning distance data
glimpse(assignKMeans(km=km003, df=tmpTempTrain, returnAllDistanceData=TRUE))
## Rows: 88,452
## Columns: 4
## $ V1 <dbl> 213.5079, 213.6346, 212.8142, 214.0806, 214.9293, 215.3680, 214.573…
## $ V2 <dbl> 118.3220, 119.1694, 122.5357, 123.3177, 123.4207, 123.4264, 122.838…
## $ V3 <dbl> 264.9878, 268.2284, 269.5186, 271.1512, 271.9947, 272.7043, 272.358…
## $ cl <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
# Confirmation that cluster assignments match (could occasionally have a tied distance and possible mismatch)
table(assignKMeans(km=km003, df=tmpTempTrain), km003$cluster)
##
## 1 2 3
## 1 21790 0 0
## 2 0 35494 0
## 3 0 0 31168
Clustering with k=4 is explored:
# Function run on the 4-cluster k-means object
runKMeans(df=tmpTempTrain,
km=kmList[[4]],
plotMeans=TRUE,
plotPct=list(c("todSeason"), c("hour", "month")),
nrowPct=1,
nrowMeans=1
)
With 4 clusters, data are broadly split as warm/cold season and day/night
Clustering with k=5 is explored:
# Function run on the 5-cluster k-means object
runKMeans(df=tmpTempTrain,
km=kmList[[5]],
plotMeans=TRUE,
plotPct=list(c("todSeason"), c("hour", "month")),
nrowPct=1,
nrowMeans=1
)
With 5 clusters, data are broadly split as precipitation/no-precipitation, with the no-precipitation observations further split as warm/cold season and day/night
Principal component analysis is run to explore variance explained by number of components:
# Correlation analysis
corTrain <- cor(tmpTempTrain[, varsTrain])
hcTrain <- hclust(as.dist((1-corTrain)/2))
orderTrain <- hcTrain$order %>% purrr::set_names(hcTrain$labels)
tmpHeat <- as.data.frame(corTrain, row.names=rownames(corTrain)) %>%
rownames_to_column("var1") %>%
tibble::as_tibble() %>%
pivot_longer(cols=-c("var1"), names_to="var2") %>%
mutate()
tmpHeat %>%
ggplot(aes(x=fct_reorder(var1, orderTrain[var1]), y=fct_reorder(var2, orderTrain[var2]))) +
geom_tile(aes(fill=value)) +
geom_text(aes(label=round(value, 2)), size=2) +
scale_fill_gradient2(low="red", mid="white", high="green") +
labs(x=NULL, y=NULL, title="Correlations") +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))
pcaTrain <- prcomp(tmpTempTrain[, varsTrain])
summary(pcaTrain)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 92.9569 72.4488 65.4697 49.8242 42.28983 40.1619
## Proportion of Variance 0.2812 0.1708 0.1395 0.0808 0.05821 0.0525
## Cumulative Proportion 0.2812 0.4521 0.5916 0.6724 0.73060 0.7831
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 35.41510 33.3023 29.70577 28.30567 25.17545 21.86309
## Proportion of Variance 0.04082 0.0361 0.02872 0.02608 0.02063 0.01556
## Cumulative Proportion 0.82392 0.8600 0.88874 0.91482 0.93544 0.95100
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 18.17946 14.89146 12.83779 12.5134 9.85089 9.7572
## Proportion of Variance 0.01076 0.00722 0.00536 0.0051 0.00316 0.0031
## Cumulative Proportion 0.96176 0.96898 0.97434 0.9794 0.98260 0.9857
## PC19 PC20 PC21 PC22 PC23 PC24 PC25
## Standard deviation 9.04743 8.63703 7.81781 7.45580 7.02528 5.32338 5.29881
## Proportion of Variance 0.00266 0.00243 0.00199 0.00181 0.00161 0.00092 0.00091
## Cumulative Proportion 0.98836 0.99079 0.99278 0.99459 0.99619 0.99711 0.99803
## PC26 PC27 PC28 PC29 PC30 PC31 PC32
## Standard deviation 4.2993 4.13242 3.61050 2.52868 1.83323 1.18543 0.90933
## Proportion of Variance 0.0006 0.00056 0.00042 0.00021 0.00011 0.00005 0.00003
## Cumulative Proportion 0.9986 0.99919 0.99961 0.99982 0.99993 0.99997 1.00000
tibble::tibble(sd=pcaTrain$sdev, var=sd**2, n=1:length(pcaTrain$sdev)) %>%
ggplot(aes(x=n)) +
geom_col(aes(y=var/sum(var)), fill="lightblue") +
geom_text(aes(y=cumsum(var)/sum(var), label=round(cumsum(var)/sum(var), 2)), hjust=0, size=2.5) +
geom_line(aes(y=cumsum(var)/sum(var))) +
labs(x="Component", y="Variance Explained", title="Variance Explained (cumulative and incremental)")
A simple random forest is explored, for prediction of month:
# Simple random forest model
rfTempTrainMonth <- ranger::ranger(month ~ .,
data=tmpTempTrain[, c('month', varsTrain)],
importance = "impurity"
)
rfTempTrainMonth
## Ranger result
##
## Call:
## ranger::ranger(month ~ ., data = tmpTempTrain[, c("month", varsTrain)], importance = "impurity")
##
## Type: Classification
## Number of trees: 500
## Sample size: 88452
## Number of independent variables: 32
## Mtry: 5
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 0.27 %
# Variable importance
rfTempTrainMonth$variable.importance %>%
as.data.frame() %>%
purrr::set_names("imp") %>%
rownames_to_column("metric") %>%
tibble::as_tibble() %>%
ggplot(aes(x=fct_reorder(metric, imp), y=imp/1000)) +
geom_col(fill="lightblue") +
labs(x=NULL, y="Variable Importance (000)", title="Simple random forest to predict month") +
coord_flip()
# Performance on test data (confirm >99% accuracy)
rfTempTest <- tmpTempTest %>%
mutate(pred=predict(rfTempTrainMonth, data=.)$predictions)
cat("\nAccuracy on test dataset is: ", round(100*mean(rfTempTest$pred==rfTempTest$month), 2), "%\n", sep="")
##
## Accuracy on test dataset is: 99.72%
rfTempTest %>%
count(month, pred) %>%
ggplot(aes(x=pred, y=month)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=2.5) +
scale_fill_continuous("", low="white", high="green") +
labs(x="Predicted month", y="Actual month", title="Predicting month on test data")
The simple random forest has over 99% predictive accuracy on month, primarily focusing on metrics related to soil (soil temperature and soil moisture at various depths).
A portion of the predictive accuracy may be based on specific soil trends during a given year, as it is very unlikely that data consistently change right at 00h00 of a new month. Models are run using a holdout year:
# Simple random forest model, holding out 2022 data
rfTempHoldout <- ranger::ranger(month ~ .,
data=tmpTempTrain[year(tmpTempTrain$date) != 2022, c('month', varsTrain)],
importance = "impurity"
)
rfTempHoldout
## Ranger result
##
## Call:
## ranger::ranger(month ~ ., data = tmpTempTrain[year(tmpTempTrain$date) != 2022, c("month", varsTrain)], importance = "impurity")
##
## Type: Classification
## Number of trees: 500
## Sample size: 81908
## Number of independent variables: 32
## Mtry: 5
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 0.26 %
# Performance on holdout data
rfTempTest <- tmpTempTrain %>%
bind_rows(tmpTempTest) %>%
filter(year(date)==2022) %>%
mutate(pred=predict(rfTempHoldout, data=.)$predictions)
cat("\nAccuracy on holdout 2022 data is: ", round(100*mean(rfTempTest$pred==rfTempTest$month), 2), "%\n", sep="")
##
## Accuracy on holdout 2022 data is: 85.53%
rfTempTest %>%
count(month, pred) %>%
ggplot(aes(x=pred, y=month)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=2.5) +
scale_fill_continuous("", low="white", high="green") +
labs(x="Predicted month (2022)",
y="Actual month (2022)",
title="Applying random forest fit without 2022 data to 2022"
)
Without access to training data including 2022, the model still makes good predictions for 2022 month. But, predictions are commonly off by +/- 1 month leading to overall accuracy of 85% (vs. 99%+ when able to train on soil heating patterns in the given year)
The random forest is run using only the four most important variables:
# Simple random forest model, with only the four most important variables, holding out 2022 data
varsTop4 <- sort(rfTempHoldout$variable.importance, decreasing=TRUE)[1:4] %>% names
rfTempHoldTop4 <- ranger::ranger(month ~ .,
data=tmpTempTrain[year(tmpTempTrain$date) != 2022, c('month', varsTop4)],
importance = "impurity"
)
rfTempHoldTop4
## Ranger result
##
## Call:
## ranger::ranger(month ~ ., data = tmpTempTrain[year(tmpTempTrain$date) != 2022, c("month", varsTop4)], importance = "impurity")
##
## Type: Classification
## Number of trees: 500
## Sample size: 81908
## Number of independent variables: 4
## Mtry: 2
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 0.90 %
# Performance on holdout data
tmpPred <- tmpTempTrain %>%
bind_rows(tmpTempTest) %>%
filter(year(date)==2022) %>%
mutate(pred=predict(rfTempHoldTop4, data=.)$predictions)
cat("\nAccuracy on holdout 2022 data is: ",
round(100*mean(tmpPred$pred==tmpPred$month), 2),
"%\n",
sep=""
)
##
## Accuracy on holdout 2022 data is: 80.07%
tmpPred %>%
count(month, pred) %>%
ggplot(aes(x=pred, y=month)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=2.5) +
scale_fill_continuous("", low="white", high="green") +
labs(x="Predicted month (2022)",
y="Actual month (2022)",
title="Applying random forest fit without 2022 data to 2022",
subtitle="Most important 4 variables only"
)
Even with just 4 variables, the simple random forest retains 80% accuracy in predicting month for a holdout year, with all predictions being +/- 1 month of actual
Accuracy of predictions by day of month is explored:
# Full plot
tmpPred %>%
bind_rows(rfTempTest, .id="src") %>%
mutate(day=day(date), src=c("1"="Top-4", "2"="All")[src]) %>%
group_by(src, day) %>%
summarize(mu=mean(month==pred), .groups="drop") %>%
ggplot(aes(x=factor(day), y=mu)) +
geom_line(aes(group=src, color=src)) +
geom_point(aes(color=src), size=1) +
lims(y=c(0, 1)) +
labs(x="Day of month",
y="Accuracy of predicting month",
title="Accuracy of predicting month by day of month"
) +
scale_color_discrete("Features")
# Plot facetted by month
tmpPred %>%
bind_rows(rfTempTest, .id="src") %>%
mutate(day=day(date), src=c("1"="Top-4", "2"="All")[src]) %>%
group_by(src, day, month) %>%
summarize(mu=mean(month==pred), .groups="drop") %>%
ggplot(aes(x=factor(day), y=mu)) +
geom_line(aes(group=src, color=src)) +
geom_point(aes(color=src), size=1) +
lims(y=c(0, 1)) +
labs(x="Day of month",
y="Accuracy of predicting month",
title="Accuracy of predicting month by day of month"
) +
scale_color_discrete("Features") +
facet_wrap(~month)
Predictions near mid-month tend to be more accurate than predictions near the borders between months, consistent with soil temperatures gradually increasing or decreasing as seasons progress
A simple random forest is explored, for prediction of year:
# Simple random forest model
rfTempTrainYear <- ranger::ranger(fct_year ~ .,
data=tmpTempTrain %>%
mutate(fct_year=factor(year(date))) %>%
select(all_of(c("fct_year", 'month', varsTrain))),
importance = "impurity"
)
rfTempTrainYear
## Ranger result
##
## Call:
## ranger::ranger(fct_year ~ ., data = tmpTempTrain %>% mutate(fct_year = factor(year(date))) %>% select(all_of(c("fct_year", "month", varsTrain))), importance = "impurity")
##
## Type: Classification
## Number of trees: 500
## Sample size: 88452
## Number of independent variables: 33
## Mtry: 5
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 0.00 %
# Variable importance
rfTempTrainYear$variable.importance %>%
as.data.frame() %>%
purrr::set_names("imp") %>%
rownames_to_column("metric") %>%
tibble::as_tibble() %>%
ggplot(aes(x=fct_reorder(metric, imp), y=imp/1000)) +
geom_col(fill="lightblue") +
labs(x=NULL, y="Variable Importance (000)", title="Simple random forest to predict year") +
coord_flip()
# Performance on test data (confirm >99% accuracy)
rfTempTest <- tmpTempTest %>%
mutate(fct_year=factor(year(date)), pred=predict(rfTempTrainYear, data=.)$predictions)
cat("\nAccuracy on test dataset is: ", round(100*mean(rfTempTest$pred==rfTempTest$fct_year), 2), "%\n", sep="")
##
## Accuracy on test dataset is: 100%
rfTempTest %>%
count(fct_year, pred) %>%
ggplot(aes(x=pred, y=fct_year)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=2.5) +
scale_fill_continuous("", low="white", high="green") +
labs(x="Predicted year", y="Actual year", title="Predicting year on test data")
The years are sufficiently distinct that the random forest is able to separate them perfectly, with soil moisture being a primary explanatory variable
The random forest is re-run using only the four most important variables:
# Simple random forest model, with only the four most important variables
varsTop4 <- sort(rfTempTrainYear$variable.importance, decreasing=TRUE)[1:4] %>% names
rfTempYearTop4 <- ranger::ranger(fct_year ~ .,
data=tmpTempTrain %>%
mutate(fct_year=factor(year(date))) %>%
select(all_of(c("fct_year", varsTop4))),
importance = "impurity"
)
rfTempYearTop4
## Ranger result
##
## Call:
## ranger::ranger(fct_year ~ ., data = tmpTempTrain %>% mutate(fct_year = factor(year(date))) %>% select(all_of(c("fct_year", varsTop4))), importance = "impurity")
##
## Type: Classification
## Number of trees: 500
## Sample size: 88452
## Number of independent variables: 4
## Mtry: 2
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 0.13 %
# Variable importance
rfTempYearTop4$variable.importance %>%
as.data.frame() %>%
purrr::set_names("imp") %>%
rownames_to_column("metric") %>%
tibble::as_tibble() %>%
ggplot(aes(x=fct_reorder(metric, imp), y=imp/1000)) +
geom_col(fill="lightblue") +
labs(x=NULL,
y="Variable Importance (000)",
title="Simple random forest to predict year",
subtitle="Restricted to top-4 importance variables from previous forest") +
coord_flip()
# Performance on test data (confirm >99% accuracy)
rfTempTest <- tmpTempTest %>%
mutate(fct_year=factor(year(date)), pred=predict(rfTempYearTop4, data=.)$predictions)
cat("\nAccuracy on test dataset is: ", round(100*mean(rfTempTest$pred==rfTempTest$fct_year), 2), "%\n", sep="")
##
## Accuracy on test dataset is: 99.9%
rfTempTest %>%
count(fct_year, pred) %>%
ggplot(aes(x=pred, y=fct_year)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=2.5) +
scale_fill_continuous("", low="white", high="green") +
labs(x="Predicted year",
y="Actual year",
title="Predicting year on test data",
subtitle="Top-4 predictors by importance only"
)
Over the course of the 13.5-year training data, there is sufficient variation in soil temperature and moisture for the model to assess year almost perfectly. Further exploration is needed for how the model is able to make clean distinctions between, for example, very late on December 31 and very early on January 1
Data are further explored for uniqueness of the four key variables:
# Full dataset of values, sorted
tmpTempFull <- tmpTempTrain %>%
bind_rows(tmpTempTest, .id="src") %>%
arrange(time) %>%
mutate(src=c("1"="Train", "2"="Test")[src], yyyymm=customYYYYMM(date), year=year(date))
tmpTempFull
## # A tibble: 117,936 × 90
## src time date hour temperat…¹ relat…² dewpo…³ appar…⁴
## <chr> <dttm> <date> <int> <dbl> <int> <dbl> <dbl>
## 1 Train 2010-01-01 00:00:00 2010-01-01 0 -9.5 67 -14.4 -15.8
## 2 Train 2010-01-01 01:00:00 2010-01-01 1 -9.8 69 -14.4 -16.3
## 3 Test 2010-01-01 02:00:00 2010-01-01 2 -10.3 73 -14.2 -16.8
## 4 Train 2010-01-01 03:00:00 2010-01-01 3 -10.8 74 -14.5 -17.2
## 5 Train 2010-01-01 04:00:00 2010-01-01 4 -11.3 75 -14.8 -17.7
## 6 Train 2010-01-01 05:00:00 2010-01-01 5 -11.8 76 -15.1 -18.2
## 7 Test 2010-01-01 06:00:00 2010-01-01 6 -12.3 77 -15.5 -18.6
## 8 Train 2010-01-01 07:00:00 2010-01-01 7 -12.8 78 -15.8 -19
## 9 Train 2010-01-01 08:00:00 2010-01-01 8 -13.2 79 -16.1 -19.4
## 10 Test 2010-01-01 09:00:00 2010-01-01 9 -13.4 78 -16.3 -19.6
## # … with 117,926 more rows, 82 more variables: pressure_msl <dbl>,
## # surface_pressure <dbl>, precipitation <dbl>, rain <dbl>, snowfall <dbl>,
## # cloudcover <int>, cloudcover_low <int>, cloudcover_mid <int>,
## # cloudcover_high <int>, shortwave_radiation <dbl>, direct_radiation <dbl>,
## # direct_normal_irradiance <dbl>, diffuse_radiation <dbl>,
## # windspeed_10m <dbl>, windspeed_100m <dbl>, winddirection_10m <int>,
## # winddirection_100m <int>, windgusts_10m <dbl>, …
# Number of combinations of percentile (top-4 variables)
# all_of() silences the tidyselect deprecation warning for external vectors
tmpTempFull %>%
count(across(all_of(varsTop4)), sort=TRUE)
## Warning: There was 1 warning in `count()`.
## ℹ In argument: `across(varsTop4)`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(varsTop4)
##
## # Now:
## data %>% select(all_of(varsTop4))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## # A tibble: 13,279 × 5
## pct_soil_moisture_100_to_255cm pct_soil_moisture_28_t…¹ pct_s…² pct_s…³ n
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 2 2 99 73 146
## 2 18 31 97 88 131
## 3 8 1 94 78 108
## 4 3 2 99 98 99
## 5 65 49 4 3 96
## 6 3 3 100 99 92
## 7 5 0 98 96 78
## 8 17 6 98 82 75
## 9 33 17 88 77 75
## 10 9 4 88 66 74
## # … with 13,269 more rows, and abbreviated variable names
## # ¹pct_soil_moisture_28_to_100cm, ²pct_soil_temperature_100_to_255cm,
## # ³pct_soil_temperature_28_to_100cm
# Number of combinations of percentile (top-4 variables) and year
# all_of() silences the tidyselect deprecation warning for external vectors
tmpTempFull %>%
count(across(c(all_of(varsTop4), "year")), sort=TRUE)
## # A tibble: 13,300 × 6
## pct_soil_moisture_100_to_255cm pct_soil_moistur…¹ pct_s…² pct_s…³ year n
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2 2 99 73 2012 146
## 2 18 31 97 88 2016 131
## 3 8 1 94 78 2013 108
## 4 3 2 99 98 2012 99
## 5 65 49 4 3 2010 96
## 6 3 3 100 99 2012 92
## 7 5 0 98 96 2012 78
## 8 17 6 98 82 2021 75
## 9 33 17 88 77 2015 75
## 10 9 4 88 66 2022 74
## # … with 13,290 more rows, and abbreviated variable names
## # ¹pct_soil_moisture_28_to_100cm, ²pct_soil_temperature_100_to_255cm,
## # ³pct_soil_temperature_28_to_100cm
# Number of combinations of percentile (top-4 variables) and year-month
# all_of() silences the tidyselect deprecation warning for external vectors
tmpTempFull %>%
count(across(c(all_of(varsTop4), "yyyymm")), sort=TRUE)
## # A tibble: 13,439 × 6
## pct_soil_moisture_100_to_255cm pct_soil_moistu…¹ pct_s…² pct_s…³ yyyymm n
## <dbl> <dbl> <dbl> <dbl> <chr> <int>
## 1 18 31 97 88 2016-… 131
## 2 65 49 4 3 2010-… 96
## 3 2 2 99 73 2012-… 92
## 4 3 3 100 99 2012-… 92
## 5 5 0 98 96 2012-… 78
## 6 17 6 98 82 2021-… 75
## 7 33 17 88 77 2015-… 75
## 8 9 4 88 66 2022-… 74
## 9 43 42 34 45 2021-… 71
## 10 4 0 98 95 2012-… 66
## # … with 13,429 more rows, and abbreviated variable names
## # ¹pct_soil_moisture_28_to_100cm, ²pct_soil_temperature_100_to_255cm,
## # ³pct_soil_temperature_28_to_100cm
# Plot of moisture and temperature
tmpTempFull %>%
mutate(fct_year=factor(year)) %>%
count(x255=pct_soil_moisture_100_to_255cm, y100=pct_soil_moisture_28_to_100cm, fct_year, month) %>%
filter(fct_year %in% 2014:2017) %>%
ggplot() +
geom_point(aes(x=x255, y=y100, color=month, size=n)) +
facet_wrap(~fct_year) +
labs(x="Soil moisture percentile (100-255 cm)",
y="Soil moisture percentile (28-100 cm)",
title="Soil moisture patterns by year"
)
Of the 117,936 observations, there are 13,279 combinations of percentile for the top-4 variables. Only a very few combinations span across different years (21) or even months (160), explaining the very high explanatory power of the model. Forward-looking predictive power is likely to be very poor, as the model needs to be trained on the specific patterns of soil moisture and temperature that evolved in a given year. But, specific patterns of observed soil moisture appear to be a characteristic signature of a specific year in the training data
The model is re-run predicting year with all variables except for the top-4:
# Simple random forest model, excluding the four most important variables
varsNonTop4 <- sort(rfTempTrainYear$variable.importance, decreasing=TRUE)[-c(1:4)] %>% names
rfTempYearNonTop4 <- ranger::ranger(fct_year ~ .,
data=tmpTempTrain %>%
mutate(fct_year=factor(year(date))) %>%
select(all_of(c("fct_year", varsNonTop4))),
importance = "impurity"
)
## Growing trees.. Progress: 64%. Estimated remaining time: 17 seconds.
rfTempYearNonTop4
## Ranger result
##
## Call:
## ranger::ranger(fct_year ~ ., data = tmpTempTrain %>% mutate(fct_year = factor(year(date))) %>% select(all_of(c("fct_year", varsNonTop4))), importance = "impurity")
##
## Type: Classification
## Number of trees: 500
## Sample size: 88452
## Number of independent variables: 29
## Mtry: 5
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 1.65 %
# Variable importance
rfTempYearNonTop4$variable.importance %>%
as.data.frame() %>%
purrr::set_names("imp") %>%
rownames_to_column("metric") %>%
tibble::as_tibble() %>%
ggplot(aes(x=fct_reorder(metric, imp), y=imp/1000)) +
geom_col(fill="lightblue") +
labs(x=NULL,
y="Variable Importance (000)",
title="Simple random forest to predict year",
subtitle="Excludes top-4 importance variables from full forest") +
coord_flip()
# Performance on test data (confirm >99% accuracy)
rfTempTest <- tmpTempTest %>%
mutate(fct_year=factor(year(date)), pred=predict(rfTempYearNonTop4, data=.)$predictions)
cat("\nAccuracy on test dataset is: ", round(100*mean(rfTempTest$pred==rfTempTest$fct_year), 2), "%\n", sep="")
##
## Accuracy on test dataset is: 98.47%
rfTempTest %>%
count(fct_year, pred) %>%
ggplot(aes(x=pred, y=fct_year)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=2.5) +
scale_fill_continuous("", low="white", high="green") +
labs(x="Predicted year",
y="Actual year",
title="Predicting year on test data",
subtitle="Excludes top-4 predictors by importance"
)
There is still sufficient annual difference in the data to effectively determine year based on explanatory variables excluding the top-4 in importance
The model is re-run for predicting month with all variables except for the top-4:
# Simple random forest model, excluding the four most important variables
varsNonTop4 <- sort(rfTempTrainMonth$variable.importance, decreasing=TRUE)[-c(1:4)] %>% names
rfTempMonthNonTop4 <- ranger::ranger(month ~ .,
data=tmpTempTrain %>%
mutate(fct_year=factor(year(date))) %>%
select(all_of(c("month", varsNonTop4))),
importance = "impurity"
)
## Growing trees.. Progress: 69%. Estimated remaining time: 13 seconds.
rfTempMonthNonTop4
## Ranger result
##
## Call:
## ranger::ranger(month ~ ., data = tmpTempTrain %>% mutate(fct_year = factor(year(date))) %>% select(all_of(c("month", varsNonTop4))), importance = "impurity")
##
## Type: Classification
## Number of trees: 500
## Sample size: 88452
## Number of independent variables: 28
## Mtry: 5
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 4.61 %
# Variable importance
rfTempMonthNonTop4$variable.importance %>%
as.data.frame() %>%
purrr::set_names("imp") %>%
rownames_to_column("metric") %>%
tibble::as_tibble() %>%
ggplot(aes(x=fct_reorder(metric, imp), y=imp/1000)) +
geom_col(fill="lightblue") +
labs(x=NULL,
y="Variable Importance (000)",
title="Simple random forest to predict month",
subtitle="Excludes top-4 importance variables from full forest") +
coord_flip()
# Performance on test data (confirm >95% accuracy)
rfTempTest <- tmpTempTest %>%
mutate(fct_year=factor(year(date)), pred=predict(rfTempMonthNonTop4, data=.)$predictions)
cat("\nAccuracy on test dataset is: ", round(100*mean(rfTempTest$pred==rfTempTest$month), 2), "%\n", sep="")
##
## Accuracy on test dataset is: 95.46%
rfTempTest %>%
count(month, pred) %>%
ggplot(aes(x=pred, y=month)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=2.5) +
scale_fill_continuous("", low="white", high="green") +
labs(x="Predicted month",
y="Actual month",
title="Predicting month on test data",
subtitle="Excludes top-4 predictors by importance"
)
There is still sufficient annual difference in the data to effectively determine month based on explanatory variables excluding the top-4 in importance
The random forest is re-run excluding the four most important variables, with a holdout year:
# Simple random forest model, excluding the four most important variables, holding out 2022 data
varsNonTop4 <- sort(rfTempTrainMonth$variable.importance, decreasing=TRUE)[-c(1:4)] %>% names
rfTempHoldNonTop4 <- ranger::ranger(month ~ .,
data=tmpTempTrain %>%
mutate(fct_year=factor(year(date))) %>%
filter(year(date)<2022) %>%
select(all_of(c("month", varsNonTop4))),
importance = "impurity"
)
## Growing trees.. Progress: 76%. Estimated remaining time: 9 seconds.
rfTempHoldNonTop4
## Ranger result
##
## Call:
## ranger::ranger(month ~ ., data = tmpTempTrain %>% mutate(fct_year = factor(year(date))) %>% filter(year(date) < 2022) %>% select(all_of(c("month", varsNonTop4))), importance = "impurity")
##
## Type: Classification
## Number of trees: 500
## Sample size: 78915
## Number of independent variables: 28
## Mtry: 5
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 4.56 %
# Performance on holdout data
tmpPred <- tmpTempTrain %>%
bind_rows(tmpTempTest) %>%
filter(year(date)==2022) %>%
mutate(pred=predict(rfTempHoldNonTop4, data=.)$predictions)
cat("\nAccuracy on holdout 2022 data is: ",
round(100*mean(tmpPred$pred==tmpPred$month), 2),
"%\n",
sep=""
)
##
## Accuracy on holdout 2022 data is: 52.79%
# Confusion heat map for the 2022 holdout predictions of the model fit
# WITHOUT the top-4 variables; subtitle corrected (it was copy-pasted from
# the top-4-only chunk and misdescribed this model)
tmpPred %>%
count(month, pred) %>%
ggplot(aes(x=pred, y=month)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=2.5) +
scale_fill_continuous("", low="white", high="green") +
labs(x="Predicted month (2022)",
y="Actual month (2022)",
title="Applying random forest fit without 2022 data to 2022",
subtitle="Excludes top-4 predictors by importance"
)
# Histogram of signed month error: map (actual - predicted) into [-6, 5]
# with modular arithmetic so December vs. January counts as 1 month, not 11
tmpPred %>%
  transmute(delta = ((as.integer(month) - as.integer(pred) + 6) %% 12) - 6) %>%
  ggplot(aes(x = delta)) +
  geom_bar(fill = "lightblue") +
  labs(
    title = "Difference in months (predicted vs. actual)",
    x = "Difference in months",
    y = "Number"
  )
Excluding top-4 variables, the model successfully memorizes patterns, but is less successful in generalizing for forward-looking predictions. Predictions on a future year are ~50% accurate, compared with ~95% accuracy for predictions on unseen data in modeled years. This suggests high autocorrelation among data elements, such that an unseen data point at 12h00 is very similar to seen data points at 11h00 and 13h00, and similar (though less so) to seen data points at 12h00 exactly 1 year ago and/or 1 year in the future. Predictions on an unseen year are usually within +/- 1 month of actual, so the model is learning generalized trends about seasons
The random forest regression is run for predicting temperature:
# Variables to include for modeling: keep all pct_-scaled predictors except
# the target's own percentile (pct_temperature_2m), the categorical
# weathercode, and any year-specific pct_YYYY columns
varsTemp <- tmpTempTrain %>%
select(-matches("pct_\\d{4}$"), -pct_temperature_2m, -pct_weathercode) %>%
select(starts_with("pct_")) %>%
names
varsTemp
## [1] "pct_hour" "pct_relativehumidity_2m"
## [3] "pct_dewpoint_2m" "pct_apparent_temperature"
## [5] "pct_pressure_msl" "pct_surface_pressure"
## [7] "pct_precipitation" "pct_rain"
## [9] "pct_snowfall" "pct_cloudcover"
## [11] "pct_cloudcover_low" "pct_cloudcover_mid"
## [13] "pct_cloudcover_high" "pct_shortwave_radiation"
## [15] "pct_direct_radiation" "pct_direct_normal_irradiance"
## [17] "pct_diffuse_radiation" "pct_windspeed_10m"
## [19] "pct_windspeed_100m" "pct_winddirection_10m"
## [21] "pct_winddirection_100m" "pct_windgusts_10m"
## [23] "pct_et0_fao_evapotranspiration" "pct_vapor_pressure_deficit"
## [25] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [27] "pct_soil_temperature_28_to_100cm" "pct_soil_temperature_100_to_255cm"
## [29] "pct_soil_moisture_0_to_7cm" "pct_soil_moisture_7_to_28cm"
## [31] "pct_soil_moisture_28_to_100cm" "pct_soil_moisture_100_to_255cm"
# Simple random forest regression for temperature using all varsTemp
# predictors — no variables excluded and no holdout year (the previous
# comment here was copied from the month-classification model and was wrong)
rfTempTemp <- ranger::ranger(temperature_2m ~ .,
data=tmpTempTrain %>%
select(all_of(c("temperature_2m", varsTemp))),
importance = "impurity"
)
## Growing trees.. Progress: 63%. Estimated remaining time: 18 seconds.
rfTempTemp
## Ranger result
##
## Call:
## ranger::ranger(temperature_2m ~ ., data = tmpTempTrain %>% select(all_of(c("temperature_2m", varsTemp))), importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 88452
## Number of independent variables: 32
## Mtry: 5
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 0.07391882
## R squared (OOB): 0.9993964
# Variable importance: horizontal bar chart of impurity importance,
# scaled to thousands, ordered from least to most important
rfTempTemp$variable.importance %>%
  tibble::enframe(name = "metric", value = "imp") %>%
  ggplot(aes(x = fct_reorder(metric, imp), y = imp / 1000)) +
  geom_col(fill = "lightblue") +
  labs(
    x = NULL,
    y = "Variable Importance (000)",
    title = "Simple random forest to predict temperature"
  ) +
  coord_flip()
# Performance on test data (confirm very low error): score the held-out
# test split and report its mean squared error
rfTempTest <- tmpTempTest %>%
mutate(pred=predict(rfTempTemp, data=.)$predictions)
cat("\nMSE on test dataset is: ", round(mean((rfTempTest$pred-rfTempTest$temperature_2m)**2), 3), "\n", sep="")
##
## MSE on test dataset is: 0.07
# Predicted vs. actual temperature: round to 0.1 to bin observations,
# size points by bin count, and overlay a count-weighted linear fit
rfTempTest %>%
  mutate(ractual = round(temperature_2m, 1), rpred = round(pred, 1)) %>%
  count(ractual, rpred) %>%
  ggplot(aes(x = ractual, y = rpred)) +
  geom_point(aes(size = n)) +
  geom_smooth(aes(weight = n), method = "lm") +
  scale_size_continuous("") +
  labs(
    x = "Actual temperature",
    y = "Predicted temperature",
    title = "Applying random forest regression for temperature"
  )
## `geom_smooth()` using formula = 'y ~ x'
Many variables are strongly correlated, making temperature a simple prediction. Apparent temperature in particular is derived from dewpoint and temperature
The random forest regression is re-run for predicting temperature, with 2022-2023 as holdout years:
# Variables to include for modeling
# NOTE(review): this recomputes varsTemp identically to the earlier
# definition; kept for the narrative flow, but it is redundant
varsTemp <- tmpTempTrain %>%
select(-matches("pct_\\d{4}$"), -pct_temperature_2m, -pct_weathercode) %>%
select(starts_with("pct_")) %>%
names
varsTemp
## [1] "pct_hour" "pct_relativehumidity_2m"
## [3] "pct_dewpoint_2m" "pct_apparent_temperature"
## [5] "pct_pressure_msl" "pct_surface_pressure"
## [7] "pct_precipitation" "pct_rain"
## [9] "pct_snowfall" "pct_cloudcover"
## [11] "pct_cloudcover_low" "pct_cloudcover_mid"
## [13] "pct_cloudcover_high" "pct_shortwave_radiation"
## [15] "pct_direct_radiation" "pct_direct_normal_irradiance"
## [17] "pct_diffuse_radiation" "pct_windspeed_10m"
## [19] "pct_windspeed_100m" "pct_winddirection_10m"
## [21] "pct_winddirection_100m" "pct_windgusts_10m"
## [23] "pct_et0_fao_evapotranspiration" "pct_vapor_pressure_deficit"
## [25] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [27] "pct_soil_temperature_28_to_100cm" "pct_soil_temperature_100_to_255cm"
## [29] "pct_soil_moisture_0_to_7cm" "pct_soil_moisture_7_to_28cm"
## [31] "pct_soil_moisture_28_to_100cm" "pct_soil_moisture_100_to_255cm"
# Random forest regression for temperature using ALL varsTemp predictors,
# trained only on pre-2022 data so 2022-2023 serve as holdout years
# (no variables are excluded here, despite the earlier section's wording)
rfTempTempHoldout <- ranger::ranger(temperature_2m ~ .,
data=tmpTempTrain %>%
filter(year(date)<2022) %>%
select(all_of(c("temperature_2m", varsTemp))),
importance = "impurity"
)
## Growing trees.. Progress: 64%. Estimated remaining time: 17 seconds.
rfTempTempHoldout
## Ranger result
##
## Call:
## ranger::ranger(temperature_2m ~ ., data = tmpTempTrain %>% filter(year(date) < 2022) %>% select(all_of(c("temperature_2m", varsTemp))), importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 78915
## Number of independent variables: 32
## Mtry: 5
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 0.07712243
## R squared (OOB): 0.9993774
# Performance on test data: score every test row and keep per-row error
# (delta); MSE is reported only for the holdout years (2022-2023)
rfTempTest <- tmpTempTest %>%
mutate(pred=predict(rfTempTempHoldout, data=.)$predictions, year=year(date), delta=temperature_2m-pred)
cat("\nMSE on test dataset for 2022-2023 (holdout years) is: ",
round(mean(rfTempTest[rfTempTest$year>=2022,]$delta**2), 3),
"\n",
sep=""
)
##
## MSE on test dataset for 2022-2023 (holdout years) is: 0.224
# Plot of MSE by year: one bar per calendar year, with the rounded MSE
# value labeled at the midpoint of each bar
rfTempTest %>%
  summarize(mse = mean(delta^2), .by = year) %>%
  ggplot(aes(x = factor(year))) +
  geom_col(aes(y = mse), fill = "lightblue") +
  geom_text(aes(y = mse / 2, label = round(mse, 2))) +
  labs(x = NULL, y = "MSE", title = "MSE of temperature predictions (modeled using 2021 and prior data)")
# Plot of predicted vs. actual temperature in the holdout years only:
# round to 0.1 to bin, size points by count, overlay a weighted lm fit
rfTempTest %>%
  filter(year >= 2022) %>%
  mutate(ractual = round(temperature_2m, 1), rpred = round(pred, 1)) %>%
  count(ractual, rpred) %>%
  ggplot(aes(x = ractual, y = rpred)) +
  geom_point(aes(size = n)) +
  geom_smooth(aes(weight = n), method = "lm") +
  scale_size_continuous("") +
  labs(
    x = "Actual temperature",
    y = "Predicted temperature",
    title = "Applying random forest regression for temperature",
    subtitle = "Holdout years 2022-2023 plotted (modeled on 2021 and prior)"
  )
## `geom_smooth()` using formula = 'y ~ x'
Given correlations among the variables, predictions remain very accurate in the holdout years, though less accurate than in years where the model has been able to see nearby data
The random forest regression is re-run for predicting temperature, with only the top 4 predictors, and with 2022-2023 as holdout years:
# Variables to include for modeling: the four MOST important predictors
# from the holdout temperature model's importance ranking
varsTop4 <- sort(rfTempTempHoldout$variable.importance, decreasing=TRUE)[c(1:4)] %>% names
# Random forest regression for temperature using ONLY the top-4 variables,
# trained on pre-2022 data (2022-2023 held out) — the earlier "excluding"
# wording did not match this fit
rfTempTempTop4Holdout <- ranger::ranger(temperature_2m ~ .,
data=tmpTempTrain %>%
filter(year(date)<2022) %>%
select(all_of(c("temperature_2m", varsTop4))),
importance = "impurity"
)
rfTempTempTop4Holdout
## Ranger result
##
## Call:
## ranger::ranger(temperature_2m ~ ., data = tmpTempTrain %>% filter(year(date) < 2022) %>% select(all_of(c("temperature_2m", varsTop4))), importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 78915
## Number of independent variables: 4
## Mtry: 2
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 0.6876289
## R squared (OOB): 0.9944492
# Performance on test data for the top-4 model: keep per-row error (delta);
# MSE is reported only for the holdout years (2022-2023)
rfTempTest <- tmpTempTest %>%
mutate(pred=predict(rfTempTempTop4Holdout, data=.)$predictions,
year=year(date),
delta=temperature_2m-pred
)
cat("\nMSE on test dataset for 2022-2023 (holdout years) is: ",
round(mean(rfTempTest[rfTempTest$year>=2022,]$delta**2), 3),
"\n",
sep=""
)
##
## MSE on test dataset for 2022-2023 (holdout years) is: 0.935
# Plot of MSE by year for the top-4-variable model: one bar per year,
# with the rounded MSE labeled at the midpoint of each bar
rfTempTest %>%
  summarize(mse = mean(delta^2), .by = year) %>%
  ggplot(aes(x = factor(year))) +
  geom_col(aes(y = mse), fill = "lightblue") +
  geom_text(aes(y = mse / 2, label = round(mse, 2))) +
  labs(
    x = NULL,
    y = "MSE",
    title = "MSE of temperature predictions (modeled using 2021 and prior data)",
    subtitle = "Top 4 predictors only"
  )
# Plot of predicted vs. actual temperature (top-4 model) in holdout years:
# round to 0.1 to bin, size points by count, overlay a weighted lm fit
rfTempTest %>%
  filter(year >= 2022) %>%
  mutate(ractual = round(temperature_2m, 1), rpred = round(pred, 1)) %>%
  count(ractual, rpred) %>%
  ggplot(aes(x = ractual, y = rpred)) +
  geom_point(aes(size = n)) +
  geom_smooth(aes(weight = n), method = "lm") +
  scale_size_continuous("") +
  labs(
    x = "Actual temperature",
    y = "Predicted temperature",
    title = "Applying random forest regression for temperature (top-4 variables only)",
    subtitle = "Holdout years 2022-2023 plotted (modeled on 2021 and prior)"
  )
## `geom_smooth()` using formula = 'y ~ x'
With only the top-4 variables, MSE on the training dataset increases, while the ratio between MSE for the holdout data and MSE for the training data decreases